Merge pull request #960 from PaddlePaddle/paddlespeech

[paddlespeech] merge deepspeech, parakeet and text_processing into paddlespeech

Merge pull request #960 from PaddlePaddle/paddlespeech
[paddlespeech] merge deepspeech, parakeet and text_processing into paddlespeech
58b24aa4 · Hui Zhang · GitHub · a0142bec · 0cd546d1 · 58b24aa4
1000 changed files
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -41,13 +41,13 @@ pull_request_rules:
        remove: ["conflicts"]
  - name: "auto add label=S2T"
    conditions:
-      - files~=^deepspeech/
+      - files~=^paddlespeech/s2t/
    actions:
      label:
        add: ["S2T"]
  - name: "auto add label=T2S"
    conditions:
-      - files~=^parakeet/
+      - files~=^paddlespeech/t2s/
    actions:
      label:
        add: ["T2S"]
@@ -59,7 +59,7 @@ pull_request_rules:
        add: ["Audio"]
  - name: "auto add label=TextProcess"
    conditions:
-      - files~=^text_processing/
+      - files~=^paddlespeech/text/
    actions:
      label:
        add: ["TextProcess"]

--- a/.travis/install.sh
+++ b/.travis/install.sh
-#!/bin/bash
-
-setup_env(){
-    cd tools && make && cd - 
-}
-
-install(){
-    if [ -f "setup.sh" ]; then
-        bash setup.sh
-        #export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-    fi
-    if [ $? != 0 ]; then
-        exit 1
-    fi
-}
-
-print_env(){
-    cat /etc/lsb-release
-    gcc -v
-    g++ -v
-}
-
-abort(){
-    echo "Run install failed" 1>&2
-    echo "Please check your code" 1>&2
-    exit 1
-}
-
-trap 'abort' 0
-set -e
-
-print_env
-setup_env
-source tools/venv/bin/activate
-install
-
-trap : 0
--- a/.travis/precommit.sh
+++ b/.travis/precommit.sh
-#!/bin/bash
-
-function abort(){
-    echo "Your commit not fit PaddlePaddle code style" 1>&2
-    echo "Please use pre-commit scripts to auto-format your code" 1>&2
-    exit 1
-}
-
-
-trap 'abort' 0
-set -e
-
-source tools/venv/bin/activate
-
-python3 --version
-
-if ! pre-commit run -a ; then
-  ls -lh
-  git diff  --exit-code
-  exit 1
-fi
-
-trap : 0
--- a/.travis/unittest.sh
+++ b/.travis/unittest.sh
-#!/bin/bash
-
-
-
-abort(){
-    echo "Run unittest failed" 1>&2
-    echo "Please check your code" 1>&2
-    exit 1
-}
-
-
-unittest(){
-    cd $1 > /dev/null
-    if [ -f "setup.sh" ]; then
-        bash setup.sh
-        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-    fi
-    if [ $? != 0 ]; then
-        exit 1
-    fi
-    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
-        xargs -0 -I{} -n1 bash -c \
-        'python3 -m unittest discover -v -s {}'
-    cd - > /dev/null
-}
-
-coverage(){
-    cd $1 > /dev/null
-
-    if [ -f "setup.sh" ]; then
-        bash setup.sh
-        export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
-    fi
-    if [ $? != 0 ]; then
-        exit 1
-    fi
-
-    find . -path ./tools/venv -prune -false -o -name 'tests' -type d -print0 | \
-        xargs -0 -I{} -n1 bash -c \
-        'python3 -m coverage run --branch {}'
-    python3 -m coverage report -m
-    python3 -m coverage html
-    cd - > /dev/null
-}
-
-trap 'abort' 0
-set -e
-
-source tools/venv/bin/activate
-#pip3 install pytest
-#unittest .
-coverage .
-
-trap : 0
--- a/paddleaudio/.gitignore
+++ b/paddleaudio/.gitignore
--- a/paddleaudio/.pre-commit-config.yaml
+++ b/paddleaudio/.pre-commit-config.yaml
--- a/paddleaudio/.style.yapf
+++ b/paddleaudio/.style.yapf
--- a/paddleaudio/LICENSE
+++ b/paddleaudio/LICENSE
--- a/paddleaudio/README.md
+++ b/paddleaudio/README.md
--- a/paddleaudio/examples/panns/README.md
+++ b/paddleaudio/examples/panns/README.md
--- a/paddleaudio/examples/panns/assets/audioset_labels.txt
+++ b/paddleaudio/examples/panns/assets/audioset_labels.txt
--- a/audio/examples/panns/audio_tag.py
+++ b/audio/examples/panns/audio_tag.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+from typing import List
+
+import numpy as np
+import paddle
+from paddleaudio.backends import load as load_audio
+from paddleaudio.features import melspectrogram
+from paddleaudio.models.panns import cnn14
+from paddleaudio.utils import logger
+
+# Command-line options for the tagging script. Parsing happens at module
+# level, so `args` is a global that the main block below reads.
+# NOTE: this file has no module docstring, so `__doc__` is None when it is
+# handed to ArgumentParser as its first positional argument.
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
+parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
+parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration(in seconds) of tagging samples to predict.')
+parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration(in seconds) between two samples.')
+parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
+args = parser.parse_args()
+# yapf: enable
+
+
+def split(waveform: np.ndarray, win_size: int, hop_size: int):
+    """
+    Cut a waveform into fixed-length, possibly overlapping windows.
+
+    A window starts every `hop_size` samples. A window that runs past the
+    end of the signal is zero-padded on the right up to `win_size`.
+
+    Returns:
+        A tuple `(time, data)`: `time` lists each window's start position as
+        a fraction of the waveform length, `data` lists the windows.
+    """
+    assert isinstance(waveform, np.ndarray)
+    total = len(waveform)
+    time, data = [], []
+    for start in range(0, total, hop_size):
+        window = waveform[start:start + win_size]
+        shortfall = win_size - len(window)
+        if shortfall > 0:
+            window = np.pad(window, (0, shortfall))
+        time.append(start / total)
+        data.append(window)
+    return time, data
+
+
+def batchify(data: List[List[float]],
+             sample_rate: int,
+             batch_size: int,
+             **kwargs):
+    """
+    Turn waveforms into mel-spectrogram features and yield them in batches.
+
+    Every waveform in `data` is converted first (extra keyword arguments are
+    forwarded to `melspectrogram`); the transposed features are then emitted
+    as lists of `batch_size` items, with any remainder in a final list.
+    """
+    features = [
+        melspectrogram(waveform, sample_rate, **kwargs).transpose()
+        for waveform in data
+    ]
+
+    # Group the features into consecutive batches.
+    pending = []
+    for feature in features:
+        pending.append(feature)
+        if len(pending) != batch_size:
+            continue
+        yield pending
+        pending = []
+    if pending:
+        yield pending
+
+
+def predict(model, data: List[List[float]], sample_rate: int,
+            batch_size: int=1):
+    """
+    Use pretrained model to make predictions.
+
+    Args:
+        model: Network that is switched to eval mode and called on each
+            feature batch.
+        data: Waveform segments to score.
+        sample_rate: Sample rate passed on to feature extraction.
+        batch_size: Number of segments per forward pass.
+
+    Returns:
+        The per-batch model outputs concatenated along the first axis as a
+        numpy array, or None when `data` yields no batches.
+    """
+    model.eval()
+    outputs = []
+    # Inference only: no_grad avoids keeping autograd state for every batch.
+    with paddle.no_grad():
+        for batch in batchify(data, sample_rate, batch_size):
+            # (batch_size, num_frames, num_melbins)
+            #   -> (batch_size, 1, num_frames, num_melbins)
+            feats = paddle.to_tensor(batch).unsqueeze(1)
+            outputs.append(model(feats).numpy())
+
+    if not outputs:
+        return None
+    # Concatenate once at the end instead of re-copying the growing result
+    # array after every batch.
+    return np.concatenate(outputs)
+
+
+if __name__ == '__main__':
+    # Tag one audio file: slice it into windows, score every window with the
+    # pretrained CNN14 tagger and store offsets + scores in an .npz file.
+    paddle.set_device(args.device)
+    model = cnn14(pretrained=True, extract_embedding=False)
+    waveform, sr = load_audio(args.wav, sr=None)
+    time, data = split(waveform,
+                       int(args.sample_duration * sr),
+                       int(args.hop_duration * sr))
+    results = predict(model, data, sr, batch_size=8)
+
+    # exist_ok avoids the check-then-create race of a separate exists() test.
+    os.makedirs(args.output_dir, exist_ok=True)
+    # Reuse the window offsets produced by split(). Rebuilding them with
+    # np.arange and a floating-point step can yield one entry more or fewer
+    # than there are score rows, misaligning times and scores.
+    time = np.asarray(time)
+    output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz')
+    np.savez(output_file, time=time, scores=results)
+    logger.info(f'Saved tagging results to {output_file}')
--- a/audio/examples/panns/parse_result.py
+++ b/audio/examples/panns/parse_result.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import ast
+import os
+from typing import Dict
+
+import numpy as np
+from paddleaudio.utils import logger
+
+# Command-line options for turning a saved tagging result into readable
+# top-k labels. Parsed at module level; `args` is read by the main block.
+# `--smooth` is parsed with ast.literal_eval, so it expects a Python literal
+# such as True or False.
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--tagging_file', type=str, required=True, help='')
+parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.')
+parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.')
+parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.')
+parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.')
+parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.')
+args = parser.parse_args()
+# yapf: enable
+
+
+def smooth(results: np.ndarray, win_size: int):
+    """
+    Replace each row by the mean of itself and up to `win_size - 1` rows
+    before it, modifying `results` in place.
+
+    Rows are visited last-to-first so every average is computed from rows
+    that have not been overwritten yet.
+    """
+    for i in reversed(range(len(results))):
+        left = max(0, i + 1 - win_size)
+        window = results[left:i + 1]
+        results[i] = np.sum(window, axis=0) / (i - left + 1)
+
+
+def generate_topk_label(k: int, label_map: Dict, result: np.ndarray):
+    """
+    Format the `k` highest-scoring entries of `result` as text.
+
+    Each entry becomes one `label: score` line (label looked up in
+    `label_map` by index), ordered from highest to lowest score.
+    """
+    scores = np.asarray(result)
+    best_first = (-scores).argsort()[:k]
+    return ''.join(f'{label_map[idx]}: {scores[idx]}\n' for idx in best_first)
+
+
+if __name__ == "__main__":
+    # The 0-based line number in the label file is used as the class index.
+    with open(args.label_file, 'r') as f:
+        label_map = {idx: line.strip() for idx, line in enumerate(f)}
+
+    tagging = np.load(args.tagging_file, allow_pickle=True)
+    times = tagging['time']
+    scores = tagging['scores']
+
+    if args.smooth:
+        logger.info('Posterior smoothing...')
+        smooth(scores, win_size=args.smooth_size)
+
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+    # Output name: tagging file's base name up to its first dot, plus .txt.
+    stem = os.path.basename(args.tagging_file).split('.')[0]
+    output_file = os.path.join(args.output_dir, stem + '.txt')
+    with open(output_file, 'w') as f:
+        for stamp, score in zip(times, scores):
+            top_labels = generate_topk_label(args.top_k, label_map, score)
+            f.write(f'{stamp}\n')
+            f.write(top_labels + '\n')
+
+    logger.info(f'Saved tagging labels to {output_file}')
--- a/paddleaudio/examples/sound_classification/README.md
+++ b/paddleaudio/examples/sound_classification/README.md
--- a/audio/examples/sound_classification/deploy/python/predict.py
+++ b/audio/examples/sound_classification/deploy/python/predict.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import numpy as np
+from paddle import inference
+from paddleaudio.backends import load as load_audio
+from paddleaudio.datasets import ESC50
+from paddleaudio.features import melspectrogram
+from scipy.special import softmax
+
+# Command-line options for the static-graph (Paddle Inference) predictor.
+# Parsed at module level; `args` is read by the main block.
+# NOTE(review): `--model_dir` is declared both required and with a default;
+# the default can never take effect while `required=True` is set.
+# NOTE(review): `--use_tensorrt` and `--enable_mkldnn` use `type=eval`, which
+# evaluates the raw command-line string as Python code. parse_result.py uses
+# ast.literal_eval for the same purpose; consider doing the same here.
+# yapf: disable
+parser = argparse.ArgumentParser()
+parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
+parser.add_argument("--batch_size", type=int, default=2, help="Batch size per GPU/CPU for training.")
+parser.add_argument('--device', choices=['cpu', 'gpu', 'xpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
+parser.add_argument('--use_tensorrt', type=eval, default=False, choices=[True, False], help='Enable to use tensorrt to speed up.')
+parser.add_argument("--precision", type=str, default="fp32", choices=["fp32", "fp16"], help='The tensorrt precision.')
+parser.add_argument('--cpu_threads', type=int, default=10, help='Number of threads to predict when using cpu.')
+parser.add_argument('--enable_mkldnn', type=eval, default=False, choices=[True, False], help='Enable to use mkldnn to speed up when using cpu.')
+parser.add_argument("--log_dir", type=str, default="./log", help="The path to save log.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def extract_features(files: list, **kwargs):
+    """Load audio files and return their mel-spectrograms as one batch.
+
+    All waveforms are right-padded with zeros to the length of the longest
+    one before feature extraction so the features can be stacked.
+
+    Args:
+        files: Paths of the audio files to load.
+        **kwargs: Extra options forwarded to `melspectrogram`.
+
+    Returns:
+        A numpy array stacking the transposed features along a new first
+        axis, one entry per input file.
+
+    Raises:
+        ValueError: If `files` is empty (there is nothing to stack).
+    """
+    loaded = [load_audio(file, sr=None) for file in files]
+    max_length = max((len(waveform) for waveform, _ in loaded), default=0)
+
+    feats = []
+    for waveform, sr in loaded:
+        pad_width = max_length - len(waveform)
+        if pad_width > 0:
+            waveform = np.pad(waveform, pad_width=(0, pad_width))
+
+        # Use this file's own sample rate. Previously every waveform was
+        # featurized with the rate of the last file that happened to load.
+        feats.append(melspectrogram(waveform, sr, **kwargs).transpose())
+
+    return np.stack(feats, axis=0)
+
+
+class Predictor(object):
+    """Wrapper around a Paddle Inference predictor for an exported model.
+
+    Loads `inference.pdmodel` / `inference.pdiparams` from `model_dir`,
+    applies device-specific settings and exposes `predict` for wav files.
+    """
+
+    def __init__(self,
+                 model_dir,
+                 device="gpu",
+                 batch_size=1,
+                 use_tensorrt=False,
+                 precision="fp32",
+                 cpu_threads=10,
+                 enable_mkldnn=False):
+        """Build the inference config and create the predictor.
+
+        Args:
+            model_dir: Directory holding the exported model and parameters.
+            device: "gpu", "cpu" or "xpu"; any other value leaves the config
+                without device-specific settings.
+            batch_size: Stored on the instance and used as the TensorRT
+                `max_batch_size`.
+            use_tensorrt: Enable the TensorRT engine (gpu branch only).
+            precision: "fp32" or "fp16"; looked up in the gpu branch.
+            cpu_threads: Math-library thread count (cpu branch only).
+            enable_mkldnn: Enable MKL-DNN (cpu branch only).
+        """
+        # NOTE(review): predict() does not split its input by batch_size; all
+        # files passed to it are run as a single batch.
+        self.batch_size = batch_size
+
+        model_file = os.path.join(model_dir, "inference.pdmodel")
+        params_file = os.path.join(model_dir, "inference.pdiparams")
+
+        assert os.path.isfile(model_file) and os.path.isfile(
+            params_file), 'Please check model and parameter files.'
+
+        config = inference.Config(model_file, params_file)
+        if device == "gpu":
+            # set GPU configs accordingly
+            # such as intialize the gpu memory, enable tensorrt
+            # presumably (initial memory pool in MB, device id) -- confirm
+            # against the Paddle Inference Config documentation
+            config.enable_use_gpu(100, 0)
+            precision_map = {
+                "fp16": inference.PrecisionType.Half,
+                "fp32": inference.PrecisionType.Float32,
+            }
+            # Raises KeyError for any precision other than fp16/fp32, even
+            # when TensorRT is disabled.
+            precision_mode = precision_map[precision]
+
+            if use_tensorrt:
+                config.enable_tensorrt_engine(
+                    max_batch_size=batch_size,
+                    min_subgraph_size=30,
+                    precision_mode=precision_mode)
+        elif device == "cpu":
+            # set CPU configs accordingly,
+            # such as enable_mkldnn, set_cpu_math_library_num_threads
+            config.disable_gpu()
+            if enable_mkldnn:
+                # cache 10 different shapes for mkldnn to avoid memory leak
+                config.set_mkldnn_cache_capacity(10)
+                config.enable_mkldnn()
+            config.set_cpu_math_library_num_threads(cpu_threads)
+        elif device == "xpu":
+            # set XPU configs accordingly
+            config.enable_xpu(100)
+
+        config.switch_use_feed_fetch_ops(False)
+        self.predictor = inference.create_predictor(config)
+        # Handles for every model input; only the first output is exposed.
+        self.input_handles = [
+            self.predictor.get_input_handle(name)
+            for name in self.predictor.get_input_names()
+        ]
+        self.output_handle = self.predictor.get_output_handle(
+            self.predictor.get_output_names()[0])
+
+    def predict(self, wavs):
+        """Classify the given wav files and return the predicted class ids.
+
+        Args:
+            wavs: Paths of the audio files; features for all of them are
+                extracted and fed to the first model input in one run.
+
+        Returns:
+            Array with the argmax class index for each input file.
+        """
+        feats = extract_features(wavs)
+
+        self.input_handles[0].copy_from_cpu(feats)
+        self.predictor.run()
+        logits = self.output_handle.copy_to_cpu()
+        # softmax is monotonic, so the argmax below matches that of the logits.
+        probs = softmax(logits, axis=1)
+        indices = np.argmax(probs, axis=1)
+
+        return indices
+
+
+if __name__ == "__main__":
+    # Build the predictor from the parsed command-line options.
+    predictor = Predictor(
+        model_dir=args.model_dir,
+        device=args.device,
+        batch_size=args.batch_size,
+        use_tensorrt=args.use_tensorrt,
+        precision=args.precision,
+        cpu_threads=args.cpu_threads,
+        enable_mkldnn=args.enable_mkldnn)
+
+    demo_paths = (
+        '~/audio_demo_resource/cat.wav',
+        '~/audio_demo_resource/dog.wav', )
+
+    # Expand "~", make each path absolute and fail early on a missing file.
+    wavs = []
+    for raw_path in demo_paths:
+        resolved = os.path.abspath(os.path.expanduser(raw_path))
+        assert os.path.isfile(
+            resolved), f'Please check input wave file: {resolved}'
+        wavs.append(resolved)
+
+    results = predictor.predict(wavs)
+    for idx, wav in enumerate(wavs):
+        label = ESC50.label_list[results[idx]]
+        print(f'Wav: {wav} \t Label: {label}')
--- a/audio/examples/sound_classification/export_model.py
+++ b/audio/examples/sound_classification/export_model.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+from model import SoundClassifier
+from paddleaudio.datasets import ESC50
+from paddleaudio.models.panns import cnn14
+
+# Command-line options for exporting a trained checkpoint as a static model.
+# Parsed at module level; `args` is read by the main block.
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
+parser.add_argument("--output_dir", type=str, default='./export', help="Path to save static model and its parameters.")
+args = parser.parse_args()
+# yapf: enable
+
+if __name__ == '__main__':
+    # Rebuild the classifier and restore the trained weights.
+    backbone = cnn14(pretrained=False, extract_embedding=True)
+    model = SoundClassifier(
+        backbone=backbone, num_class=len(ESC50.label_list))
+    state_dict = paddle.load(args.checkpoint)
+    model.set_state_dict(state_dict)
+    model.eval()
+
+    # Convert to a static graph; the first two input dimensions are left
+    # dynamic and the last one is fixed to 64.
+    input_spec = [
+        paddle.static.InputSpec(shape=[None, None, 64], dtype=paddle.float32)
+    ]
+    model = paddle.jit.to_static(model, input_spec=input_spec)
+
+    # Write the static model files with the "inference" prefix.
+    export_prefix = os.path.join(args.output_dir, "inference")
+    paddle.jit.save(model, export_prefix)
--- a/paddleaudio/examples/sound_classification/model.py
+++ b/paddleaudio/examples/sound_classification/model.py
--- a/audio/examples/sound_classification/predict.py
+++ b/audio/examples/sound_classification/predict.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import numpy as np
+import paddle
+import paddle.nn.functional as F
+from model import SoundClassifier
+from paddleaudio.backends import load as load_audio
+from paddleaudio.datasets import ESC50
+from paddleaudio.features import melspectrogram
+from paddleaudio.models.panns import cnn14
+
+# Command-line options for single-file prediction with a trained checkpoint.
+# Parsed at module level; `args` is read by the main block.
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
+parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
+parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
+parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
+args = parser.parse_args()
+# yapf: enable
+
+
+def extract_features(file: str, **kwargs):
+    """Load `file` at its native sample rate and return the transposed
+    mel-spectrogram; extra keyword arguments go to `melspectrogram`."""
+    audio, sample_rate = load_audio(file, sr=None)
+    return melspectrogram(audio, sample_rate, **kwargs).transpose()
+
+
+if __name__ == '__main__':
+    paddle.set_device(args.device)
+
+    # Rebuild the classifier and restore the trained weights.
+    backbone = cnn14(pretrained=False, extract_embedding=True)
+    model = SoundClassifier(
+        backbone=backbone, num_class=len(ESC50.label_list))
+    model.set_state_dict(paddle.load(args.checkpoint))
+    model.eval()
+
+    # Add a leading batch axis of size 1 before running the model.
+    feat = paddle.to_tensor(np.expand_dims(extract_features(args.wav), 0))
+    logits = model(feat)
+    probs = F.softmax(logits, axis=1).numpy()
+
+    # Class indices ordered from most to least probable.
+    sorted_indices = (-probs[0]).argsort()
+
+    lines = [
+        f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
+        for idx in sorted_indices[:args.top_k]
+    ]
+    msg = f'[{args.wav}]\n' + ''.join(lines)
+    print(msg)
--- a/audio/examples/sound_classification/train.py
+++ b/audio/examples/sound_classification/train.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import paddle
+from model import SoundClassifier
+from paddleaudio.datasets import ESC50
+from paddleaudio.models.panns import cnn14
+from paddleaudio.utils import logger
+from paddleaudio.utils import Timer
+
+# Command-line options for fine-tuning the sound classifier.
+# Parsed at module level; `args` is read by the main block.
+# yapf: disable
+parser = argparse.ArgumentParser(__doc__)
+parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to train model, defaults to gpu.")
+parser.add_argument("--epochs", type=int, default=50, help="Number of epoches for fine-tuning.")
+parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate used to train with warmup.")
+parser.add_argument("--batch_size", type=int, default=16, help="Total examples' number in batch for training.")
+parser.add_argument("--num_workers", type=int, default=0, help="Number of workers in dataloader.")
+parser.add_argument("--checkpoint_dir", type=str, default='./checkpoint', help="Directory to save model checkpoints.")
+parser.add_argument("--save_freq", type=int, default=10, help="Save checkpoint every n epoch.")
+parser.add_argument("--log_freq", type=int, default=10, help="Log the training infomation every n steps.")
+args = parser.parse_args()
+# yapf: enable
+
+# Fine-tune a CNN14-based SoundClassifier on ESC50, logging running loss and
+# accuracy every `log_freq` steps and evaluating + checkpointing every
+# `save_freq` epochs on rank 0.
+if __name__ == "__main__":
+    paddle.set_device(args.device)
+    # NOTE(review): `nranks` is assigned but not read anywhere in this block.
+    nranks = paddle.distributed.get_world_size()
+    # Only set up the parallel environment for multi-process runs.
+    if paddle.distributed.get_world_size() > 1:
+        paddle.distributed.init_parallel_env()
+    local_rank = paddle.distributed.get_rank()
+
+    backbone = cnn14(pretrained=True, extract_embedding=True)
+    model = SoundClassifier(backbone, num_class=len(ESC50.label_list))
+    # The model is wrapped in DataParallel regardless of the world size.
+    model = paddle.DataParallel(model)
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=args.learning_rate, parameters=model.parameters())
+    criterion = paddle.nn.loss.CrossEntropyLoss()
+
+    train_ds = ESC50(mode='train', feat_type='melspectrogram')
+    dev_ds = ESC50(mode='dev', feat_type='melspectrogram')
+
+    train_sampler = paddle.io.DistributedBatchSampler(
+        train_ds, batch_size=args.batch_size, shuffle=True, drop_last=False)
+    train_loader = paddle.io.DataLoader(
+        train_ds,
+        batch_sampler=train_sampler,
+        num_workers=args.num_workers,
+        return_list=True,
+        use_buffer_reader=True, )
+
+    # The timer is sized for the total number of training steps.
+    steps_per_epoch = len(train_sampler)
+    timer = Timer(steps_per_epoch * args.epochs)
+    timer.start()
+
+    for epoch in range(1, args.epochs + 1):
+        model.train()
+
+        # Running statistics for the current logging window.
+        avg_loss = 0
+        num_corrects = 0
+        num_samples = 0
+        for batch_idx, batch in enumerate(train_loader):
+            feats, labels = batch
+            logits = model(feats)
+
+            loss = criterion(logits, labels)
+            loss.backward()
+            optimizer.step()
+            # Step the scheduler only if the optimizer was given one; with
+            # the plain float learning rate used above this branch is skipped.
+            # NOTE(review): relies on the private `_learning_rate` attribute.
+            if isinstance(optimizer._learning_rate,
+                          paddle.optimizer.lr.LRScheduler):
+                optimizer._learning_rate.step()
+            optimizer.clear_grad()
+
+            # Calculate loss
+            avg_loss += loss.numpy()[0]
+
+            # Calculate metrics
+            preds = paddle.argmax(logits, axis=1)
+            num_corrects += (preds == labels).numpy().sum()
+            num_samples += feats.shape[0]
+
+            timer.count()
+
+            # Only rank 0 logs; the running statistics are reset inside this
+            # branch, so other ranks keep accumulating them (unused there).
+            if (batch_idx + 1) % args.log_freq == 0 and local_rank == 0:
+                lr = optimizer.get_lr()
+                avg_loss /= args.log_freq
+                avg_acc = num_corrects / num_samples
+
+                print_msg = 'Epoch={}/{}, Step={}/{}'.format(
+                    epoch, args.epochs, batch_idx + 1, steps_per_epoch)
+                print_msg += ' loss={:.4f}'.format(avg_loss)
+                print_msg += ' acc={:.4f}'.format(avg_acc)
+                print_msg += ' lr={:.6f} step/sec={:.2f} | ETA {}'.format(
+                    lr, timer.timing, timer.eta)
+                logger.train(print_msg)
+
+                avg_loss = 0
+                num_corrects = 0
+                num_samples = 0
+
+        # `batch_idx` still holds the last training step index here, so the
+        # middle condition checks that the epoch ran all of its steps.
+        if epoch % args.save_freq == 0 and batch_idx + 1 == steps_per_epoch and local_rank == 0:
+            dev_sampler = paddle.io.BatchSampler(
+                dev_ds,
+                batch_size=args.batch_size,
+                shuffle=False,
+                drop_last=False)
+            dev_loader = paddle.io.DataLoader(
+                dev_ds,
+                batch_sampler=dev_sampler,
+                num_workers=args.num_workers,
+                return_list=True, )
+
+            model.eval()
+            # Counters are reused for the validation pass.
+            num_corrects = 0
+            num_samples = 0
+            with logger.processing('Evaluation on validation dataset'):
+                for batch_idx, batch in enumerate(dev_loader):
+                    feats, labels = batch
+                    logits = model(feats)
+
+                    preds = paddle.argmax(logits, axis=1)
+                    num_corrects += (preds == labels).numpy().sum()
+                    num_samples += feats.shape[0]
+
+            print_msg = '[Evaluation result]'
+            print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
+
+            logger.eval(print_msg)
+
+            # Save model
+            # NOTE(review): save_dir is not created explicitly; presumably
+            # paddle.save creates missing parent directories -- confirm.
+            save_dir = os.path.join(args.checkpoint_dir,
+                                    'epoch_{}'.format(epoch))
+            logger.info('Saving model checkpoint to {}'.format(save_dir))
+            paddle.save(model.state_dict(),
+                        os.path.join(save_dir, 'model.pdparams'))
+            paddle.save(optimizer.state_dict(),
+                        os.path.join(save_dir, 'model.pdopt'))
--- a/paddleaudio/paddleaudio/__init__.py
+++ b/paddleaudio/paddleaudio/__init__.py
--- a/paddleaudio/paddleaudio/backends/__init__.py
+++ b/paddleaudio/paddleaudio/backends/__init__.py
--- a/paddleaudio/paddleaudio/backends/audio.py
+++ b/paddleaudio/paddleaudio/backends/audio.py
--- a/paddleaudio/paddleaudio/datasets/__init__.py
+++ b/paddleaudio/paddleaudio/datasets/__init__.py
--- a/paddleaudio/paddleaudio/datasets/aishell.py
+++ b/paddleaudio/paddleaudio/datasets/aishell.py
--- a/paddleaudio/paddleaudio/datasets/dataset.py
+++ b/paddleaudio/paddleaudio/datasets/dataset.py
--- a/paddleaudio/paddleaudio/datasets/dcase.py
+++ b/paddleaudio/paddleaudio/datasets/dcase.py
--- a/paddleaudio/paddleaudio/datasets/esc50.py
+++ b/paddleaudio/paddleaudio/datasets/esc50.py
--- a/paddleaudio/paddleaudio/datasets/gtzan.py
+++ b/paddleaudio/paddleaudio/datasets/gtzan.py
--- a/paddleaudio/paddleaudio/datasets/librispeech.py
+++ b/paddleaudio/paddleaudio/datasets/librispeech.py
--- a/paddleaudio/paddleaudio/datasets/ravdess.py
+++ b/paddleaudio/paddleaudio/datasets/ravdess.py
--- a/paddleaudio/paddleaudio/datasets/tess.py
+++ b/paddleaudio/paddleaudio/datasets/tess.py
--- a/paddleaudio/paddleaudio/datasets/urban_sound.py
+++ b/paddleaudio/paddleaudio/datasets/urban_sound.py
--- a/paddleaudio/paddleaudio/features/__init__.py
+++ b/paddleaudio/paddleaudio/features/__init__.py
--- a/audio/paddleaudio/features/augment.py
+++ b/audio/paddleaudio/features/augment.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+from numpy import ndarray as array
+from paddleaudio.backends import depth_convert
+from paddleaudio.utils import ParameterError
+
+# Names exported by `from ... import *`. The `randint` and `rand` helpers
+# defined below are not listed here.
+__all__ = [
+    'depth_augment',
+    'spect_augment',
+    'random_crop1d',
+    'random_crop2d',
+    'adaptive_spect_augment',
+]
+
+
+def randint(high: int) -> int:
+    """Draw one random integer from the half-open range [0, high).
+
+    Helper for the random data augmentation routines; uses numpy's global
+    random state and returns a plain Python int.
+    """
+    sample = np.random.randint(low=0, high=high)
+    return int(sample)
+
+
+def rand() -> float:
+    """Draw one random floating-point number from the range [0, 1).
+
+    Helper for the random data augmentation routines; uses numpy's global
+    random state and returns a plain Python float.
+    """
+    # Ask numpy for a scalar directly. Converting the 1-element array from
+    # `np.random.rand(1)` with float() is deprecated in recent NumPy.
+    return float(np.random.rand())
+
+
+def depth_augment(y: array,
+                  choices: List=['int8', 'int16'],
+                  probs: List[float]=[0.5, 0.5]) -> array:
+    """Simulate quantization distortion by a bit-depth round trip.
+
+    One target depth is drawn from `choices` with probabilities `probs`; the
+    signal is converted to that depth and then back to its original dtype.
+    `choices` and `probs` must have the same length.
+    """
+    assert len(probs) == len(choices), (
+        'number of choices {} must be equal to size of probs {}'.format(
+            len(choices), len(probs)))
+    target_depth = np.random.choice(choices, p=probs)
+    original_dtype = y.dtype
+    quantized = depth_convert(y, target_depth)
+    return depth_convert(quantized, original_dtype)
+
+
+def adaptive_spect_augment(spect: array, tempo_axis: int=0,
+                           level: float=0.1) -> array:
+    """Do adaptive spectrogram augmentation.
+
+    The strength of the augmentation is governed by the parameter `level`,
+    ranging from 0 to 1, where 0 means no augmentation. Both the number of
+    masks (`int(10 * level)` per axis) and their width (`level * 0.5` of the
+    axis length) scale with it. Masked bands are set to zero in place and
+    the same `spect` object is returned. `tempo_axis == 0` means the first
+    axis is time; any other value means the second axis is time.
+    """
+    assert spect.ndim == 2., 'only supports 2d tensor or numpy array'
+    time_axis = 0 if tempo_axis == 0 else 1
+    freq_axis = 1 - time_axis
+    nt = spect.shape[time_axis]
+    nf = spect.shape[freq_axis]
+
+    mask_count = int(10 * level)
+
+    def _zero_random_band(axis, length, width):
+        # Zero `width` consecutive entries along `axis` at a random offset.
+        start = randint(length - width)
+        band = [slice(None), slice(None)]
+        band[axis] = slice(start, start + width)
+        spect[tuple(band)] = 0
+
+    # All time masks are drawn first, then all frequency masks.
+    for _ in range(mask_count):
+        _zero_random_band(time_axis, nt, int(nt * level * 0.5))
+    for _ in range(mask_count):
+        _zero_random_band(freq_axis, nf, int(nf * level * 0.5))
+
+    return spect
+
+
+def spect_augment(spect: array,
+                  tempo_axis: int=0,
+                  max_time_mask: int=3,
+                  max_freq_mask: int=3,
+                  max_time_mask_width: int=30,
+                  max_freq_mask_width: int=20) -> array:
+    """Do spectrogram augmentation in both time and freq axis.
+
+    A random number of bands along the time axis and along the frequency axis
+    are set to zero. The input is modified in place and also returned.
+
+    Parameters:
+        spect: 2-D spectrogram.
+        tempo_axis: 0 if the first axis is time, otherwise the second axis is
+            treated as time.
+        max_time_mask / max_freq_mask: the number of masks per axis is drawn
+            from [0, max).
+        max_time_mask_width / max_freq_mask_width: the mask width per axis is
+            drawn from [0, max_width) and then clamped so that it always fits
+            inside the spectrogram.
+
+    Returns:
+        The masked spectrogram (same object as `spect`).
+    """
+    assert spect.ndim == 2, 'only supports 2d tensor or numpy array'
+    if tempo_axis == 0:
+        nt, nf = spect.shape
+    else:
+        nf, nt = spect.shape
+
+    num_time_mask = randint(max_time_mask)
+    num_freq_mask = randint(max_freq_mask)
+
+    # Clamp the sampled widths: a width >= axis length would make the start
+    # position draw below fail with a non-positive upper bound.
+    time_mask_width = min(randint(max_time_mask_width), max(nt - 1, 0))
+    freq_mask_width = min(randint(max_freq_mask_width), max(nf - 1, 0))
+
+    # Nothing can be masked along an empty axis.
+    if nt == 0:
+        num_time_mask = 0
+    if nf == 0:
+        num_freq_mask = 0
+
+    if tempo_axis == 0:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[start:start + time_mask_width, :] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[:, start:start + freq_mask_width] = 0
+    else:
+        for _ in range(num_time_mask):
+            start = randint(nt - time_mask_width)
+            spect[:, start:start + time_mask_width] = 0
+        for _ in range(num_freq_mask):
+            start = randint(nf - freq_mask_width)
+            spect[start:start + freq_mask_width, :] = 0
+
+    return spect
+
+
+def random_crop1d(y: array, crop_len: int) -> array:
+    """Do random cropping on 1d input signal.
+
+    The input is a 1d signal, typically a sound waveform.
+
+    Parameters:
+        y: 1-D signal.
+        crop_len: length of the crop; must not exceed len(y).
+
+    Returns:
+        A slice of `y` of length `crop_len` starting at a random position.
+
+    Raises:
+        ParameterError: if `y` is not one-dimensional.
+    """
+    if y.ndim != 1:
+        raise ParameterError('only accept 1d tensor or numpy array')
+    n = len(y)
+    # Valid start positions are 0..n-crop_len inclusive, hence the +1 on the
+    # exclusive upper bound (this also allows crop_len == n).
+    idx = randint(n - crop_len + 1)
+    return y[idx:idx + crop_len]
+
+
+def random_crop2d(s: array, crop_len: int, tempo_axis: int=0) -> array:
+    """Do random cropping for 2D array, typically a spectrogram.
+
+    The cropping is done in temporal direction on the time-freq input signal.
+
+    Parameters:
+        s: input array.
+        crop_len: length of the crop along `tempo_axis`; must not exceed the
+            size of that axis.
+        tempo_axis: index of the axis to crop along.
+
+    Returns:
+        `s` sliced to `crop_len` entries along `tempo_axis`, all other axes
+        kept whole.
+
+    Raises:
+        ParameterError: if `tempo_axis` is not smaller than `s.ndim`.
+    """
+    if tempo_axis >= s.ndim:
+        raise ParameterError('axis out of range')
+
+    n = s.shape[tempo_axis]
+    # Valid start positions are 0..n-crop_len inclusive, hence the +1 on the
+    # exclusive upper bound (this also allows crop_len == n).
+    idx = randint(high=n - crop_len + 1)
+    sli = [slice(None)] * s.ndim
+    sli[tempo_axis] = slice(idx, idx + crop_len)
+    return s[tuple(sli)]
--- a/audio/paddleaudio/features/core.py
+++ b/audio/paddleaudio/features/core.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import warnings
+from typing import List
+from typing import Optional
+from typing import Union
+
+import numpy as np
+import scipy
+from numpy import ndarray as array
+from numpy.lib.stride_tricks import as_strided
+from paddleaudio.utils import ParameterError
+from scipy.signal import get_window
+
+__all__ = [
+    'stft',
+    'mfcc',
+    'hz_to_mel',
+    'mel_to_hz',
+    'split_frames',
+    'mel_frequencies',
+    'power_to_db',
+    'compute_fbank_matrix',
+    'melspectrogram',
+    'spectrogram',
+    'mu_encode',
+    'mu_decode',
+]
+
+
+def pad_center(data: array, size: int, axis: int=-1, **kwargs) -> array:
+    """Pad an array to a target length along a target axis.
+
+    This differs from `np.pad` by centering the data prior to padding,
+    analogous to `str.center`.
+
+    Parameters:
+        data: array to pad.
+        size: target length along `axis`.
+        axis: axis along which to pad.
+        **kwargs: forwarded to `np.pad`; `mode` defaults to "constant".
+
+    Returns:
+        The padded array; when the total padding is odd the extra element goes
+        on the right.
+
+    Raises:
+        ParameterError: if `size` is smaller than the input along `axis`.
+    """
+
+    kwargs.setdefault("mode", "constant")
+    n = data.shape[axis]
+    lpad = int((size - n) // 2)
+
+    # Validate before building the pad widths; the message is an f-string so
+    # the actual sizes are reported.
+    if lpad < 0:
+        raise ParameterError(f"Target size ({size}) must be "
+                             f"at least input size ({n})")
+
+    lengths = [(0, 0)] * data.ndim
+    lengths[axis] = (lpad, int(size - n - lpad))
+
+    return np.pad(data, lengths, **kwargs)
+
+
+def split_frames(x: array, frame_length: int, hop_length: int,
+                 axis: int=-1) -> array:
+    """Slice a data array into (overlapping) frames.
+
+    This function is aligned with librosa.frame
+
+    Parameters:
+        x: numpy array to be framed.
+        frame_length: number of samples per frame.
+        hop_length: number of samples to advance between frames (>= 1).
+        axis: framing axis; only 0 and -1 are supported.
+
+    Returns:
+        A strided view built with `as_strided`. For axis=-1 the trailing
+        dimensions are (frame_length, n_frames); for axis=0 the leading
+        dimensions are (n_frames, frame_length).
+
+    Raises:
+        ParameterError: if `x` is not an ndarray, is shorter than
+            `frame_length` along `axis`, `hop_length` < 1, or `axis` is not
+            0 or -1.
+    """
+
+    if not isinstance(x, np.ndarray):
+        raise ParameterError(
+            f"Input must be of type numpy.ndarray, given type(x)={type(x)}")
+
+    if x.shape[axis] < frame_length:
+        raise ParameterError(f"Input is too short (n={x.shape[axis]:d})"
+                             f" for frame_length={frame_length:d}")
+
+    if hop_length < 1:
+        raise ParameterError(f"Invalid hop_length: {hop_length:d}")
+
+    # The stride computation below relies on a specific memory layout per
+    # axis, so convert (with a warning) when the input does not have it.
+    if axis == -1 and not x.flags["F_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.asfortranarray(x)
+    elif axis == 0 and not x.flags["C_CONTIGUOUS"]:
+        warnings.warn(f"librosa.util.frame called with axis={axis} "
+                      "on a non-contiguous input. This will result in a copy.")
+        x = np.ascontiguousarray(x)
+
+    # Number of complete frames that fit along the framing axis.
+    n_frames = 1 + (x.shape[axis] - frame_length) // hop_length
+    strides = np.asarray(x.strides)
+
+    # Product of the positive strides (in items), converted back to bytes;
+    # multiplied by hop_length below to get the step between frames.
+    new_stride = np.prod(strides[strides > 0] // x.itemsize) * x.itemsize
+
+    if axis == -1:
+        shape = list(x.shape)[:-1] + [frame_length, n_frames]
+        strides = list(strides) + [hop_length * new_stride]
+
+    elif axis == 0:
+        shape = [n_frames, frame_length] + list(x.shape)[1:]
+        strides = [hop_length * new_stride] + list(strides)
+
+    else:
+        raise ParameterError(f"Frame axis={axis} must be either 0 or -1")
+
+    return as_strided(x, shape=shape, strides=strides)
+
+
+def _check_audio(y, mono=True) -> bool:
+    """Determine whether a variable contains valid audio data.
+
+    The audio y must be a np.ndarray, either 1-dimensional or 2-dimensional,
+    non-empty, floating-point and finite everywhere.
+
+    Parameters:
+        y: candidate audio buffer.
+        mono: when True, only 1-dimensional input is accepted.
+
+    Returns:
+        True when every check passes.
+
+    Raises:
+        ParameterError: if any of the checks fails.
+    """
+    if not isinstance(y, np.ndarray):
+        raise ParameterError("Audio data must be of type numpy.ndarray")
+    if y.ndim < 1 or y.ndim > 2:
+        raise ParameterError(
+            f"Invalid shape for audio ndim={y.ndim:d}, shape={y.shape}")
+
+    if mono and y.ndim == 2:
+        raise ParameterError(
+            f"Invalid shape for mono audio ndim={y.ndim:d}, shape={y.shape}")
+
+    # `size` covers both the 1-d and 2-d case; indexing shape[1] would fail
+    # for 1-d input when mono=False.
+    if y.size == 0:
+        raise ParameterError(f"Audio is empty ndim={y.ndim:d}, shape={y.shape}")
+
+    if not np.issubdtype(y.dtype, np.floating):
+        raise ParameterError("Audio data must be floating-point")
+
+    if not np.isfinite(y).all():
+        raise ParameterError("Audio buffer is not finite everywhere")
+
+    return True
+
+
+def hz_to_mel(frequencies: Union[float, List[float], array],
+              htk: bool=False) -> array:
+    """Map frequencies in Hz onto the mel scale.
+
+    With `htk=True` the HTK formula ``2595 * log10(1 + f / 700)`` is used.
+    Otherwise the scale is linear below 1000 Hz and logarithmic above it.
+
+    This function is aligned with librosa.
+    """
+    hz = np.asanyarray(frequencies)
+
+    if htk:
+        return 2595.0 * np.log10(1.0 + hz / 700.0)
+
+    # Linear region: constant step of 200/3 Hz per mel starting at 0 Hz.
+    linear_origin = 0.0
+    linear_step = 200.0 / 3
+
+    # Log region: starts at 1000 Hz and uses a fixed logarithmic step.
+    knee_hz = 1000.0
+    knee_mel = (knee_hz - linear_origin) / linear_step
+    log_step = np.log(6.4) / 27.0
+
+    result = (hz - linear_origin) / linear_step
+
+    if hz.ndim == 0:
+        # Scalar input: pick the branch directly.
+        if hz >= knee_hz:
+            result = knee_mel + np.log(hz / knee_hz) / log_step
+        return result
+
+    # Array input: overwrite only the entries in the log region.
+    above = hz >= knee_hz
+    result[above] = knee_mel + np.log(hz[above] / knee_hz) / log_step
+    return result
+
+
+def mel_to_hz(mels: Union[float, List[float], array], htk: int=False) -> array:
+    """Map mel values back to frequencies in Hz.
+
+    With `htk=True` the HTK formula ``700 * (10**(m / 2595) - 1)`` is used.
+    Otherwise the scale is linear below the 1000 Hz knee and exponential
+    above it.
+
+    This function is aligned with librosa.
+    """
+    mel_values = np.asanyarray(mels)
+
+    if htk:
+        return 700.0 * (10.0**(mel_values / 2595.0) - 1.0)
+
+    # Linear region: constant step of 200/3 Hz per mel starting at 0 Hz.
+    linear_origin = 0.0
+    linear_step = 200.0 / 3
+
+    # Log region: starts at 1000 Hz and uses a fixed logarithmic step.
+    knee_hz = 1000.0
+    knee_mel = (knee_hz - linear_origin) / linear_step
+    log_step = np.log(6.4) / 27.0
+
+    result = linear_origin + linear_step * mel_values
+
+    if mel_values.ndim == 0:
+        # Scalar input: pick the branch directly.
+        if mel_values >= knee_mel:
+            result = knee_hz * np.exp(log_step * (mel_values - knee_mel))
+        return result
+
+    # Array input: overwrite only the entries in the log region.
+    above = mel_values >= knee_mel
+    result[above] = knee_hz * np.exp(log_step * (mel_values[above] - knee_mel))
+    return result
+
+
+def mel_frequencies(n_mels: int=128,
+                    fmin: float=0.0,
+                    fmax: float=11025.0,
+                    htk: bool=False) -> array:
+    """Return `n_mels` frequencies (Hz) evenly spaced on the mel scale.
+
+    The endpoints `fmin` and `fmax` are converted to mels, interpolated
+    linearly, and converted back to Hz.
+
+    This function is aligned with librosa.
+    """
+    mel_lo, mel_hi = (hz_to_mel(edge, htk=htk) for edge in (fmin, fmax))
+    return mel_to_hz(np.linspace(mel_lo, mel_hi, n_mels), htk=htk)
+
+
+def fft_frequencies(sr: int, n_fft: int) -> array:
+    """Return the center frequencies of the non-negative FFT bins.
+
+    The result has ``1 + n_fft // 2`` points evenly spaced from 0 to sr / 2,
+    both ends included.
+
+    This function is aligned with librosa.
+    """
+    n_bins = int(1 + n_fft // 2)
+    nyquist = float(sr) / 2
+    return np.linspace(0, nyquist, n_bins, endpoint=True)
+
+
+def compute_fbank_matrix(sr: int,
+                         n_fft: int,
+                         n_mels: int=128,
+                         fmin: float=0.0,
+                         fmax: Optional[float]=None,
+                         htk: bool=False,
+                         norm: str="slaney",
+                         dtype: type=np.float32):
+    """Compute fbank matrix.
+
+    Builds a bank of triangular mel filters that maps FFT bins to mel bands.
+
+    Parameters:
+        sr: sampling rate of the signal.
+        n_fft: FFT size; the matrix has ``1 + n_fft // 2`` columns.
+        n_mels: number of mel bands (rows of the matrix).
+        fmin: lowest frequency (Hz) of the filter bank.
+        fmax: highest frequency (Hz); defaults to ``sr / 2`` when None.
+        htk: use the HTK mel formula instead of the default one.
+        norm: only "slaney" is accepted.
+        dtype: dtype of the returned matrix.
+
+    Returns:
+        Array of shape ``(n_mels, 1 + n_fft // 2)``.
+
+    Raises:
+        ParameterError: if `norm` is not "slaney".
+
+    This function is aligned with librosa.
+    """
+    if norm != "slaney":
+        raise ParameterError('norm must set to slaney')
+
+    if fmax is None:
+        fmax = float(sr) / 2
+
+    # Initialize the weights
+    n_mels = int(n_mels)
+    weights = np.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype)
+
+    # Center freqs of each FFT bin
+    fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)
+
+    # 'Center freqs' of mel bands - uniformly spaced between limits
+    # (n_mels + 2 points: each filter needs a left edge, center and right edge)
+    mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax, htk=htk)
+
+    # Spacing between neighbouring mel points, and the signed distance of
+    # every FFT bin from every mel point.
+    fdiff = np.diff(mel_f)
+    ramps = np.subtract.outer(mel_f, fftfreqs)
+
+    for i in range(n_mels):
+        # lower and upper slopes for all bins
+        lower = -ramps[i] / fdiff[i]
+        upper = ramps[i + 2] / fdiff[i + 1]
+
+        # .. then intersect them with each other and zero
+        weights[i] = np.maximum(0, np.minimum(lower, upper))
+
+    # Always true here because any other norm was rejected above.
+    if norm == "slaney":
+        # Slaney-style mel is scaled to be approx constant energy per channel
+        enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels])
+        weights *= enorm[:, np.newaxis]
+
+    # Only check weights if f_mel[0] is positive
+    if not np.all((mel_f[:-2] == 0) | (weights.max(axis=1) > 0)):
+        # This means we have an empty channel somewhere
+        warnings.warn("Empty filters detected in mel frequency basis. "
+                      "Some channels will produce empty responses. "
+                      "Try increasing your sampling rate (and fmax) or "
+                      "reducing n_mels.")
+
+    return weights
+
+
+def stft(x: array,
+         n_fft: int=2048,
+         hop_length: Optional[int]=None,
+         win_length: Optional[int]=None,
+         window: str="hann",
+         center: bool=True,
+         dtype: type=np.complex64,
+         pad_mode: str="reflect") -> array:
+    """Short-time Fourier transform (STFT).
+
+    Parameters:
+        x: mono, floating-point waveform (validated by `_check_audio`).
+        n_fft: FFT size; each frame has this many samples.
+        hop_length: samples between frames; defaults to ``win_length // 4``.
+        win_length: window length; defaults to `n_fft`. Shorter windows are
+            zero-padded (centered) to `n_fft`.
+        window: window name passed to `scipy.signal.get_window`.
+        center: when True the signal is padded by ``n_fft // 2`` on both sides
+            so that frames are centered.
+        dtype: complex dtype of the result.
+        pad_mode: `np.pad` mode used when `center` is True.
+
+    Returns:
+        Complex matrix of shape ``(1 + n_fft // 2, n_frames)``.
+
+    Raises:
+        ParameterError: if `center` is False and `n_fft` exceeds the signal
+            length.
+
+    This function is aligned with librosa.
+    """
+    _check_audio(x)
+    # By default, use the entire frame
+    if win_length is None:
+        win_length = n_fft
+
+    # Set the default hop, if it's not already specified
+    if hop_length is None:
+        hop_length = int(win_length // 4)
+
+    fft_window = get_window(window, win_length, fftbins=True)
+
+    # Pad the window out to n_fft size
+    fft_window = pad_center(fft_window, n_fft)
+
+    # Reshape so that the window can be broadcast
+    fft_window = fft_window.reshape((-1, 1))
+
+    # Pad the time series so that frames are centered
+    if center:
+        if n_fft > x.shape[-1]:
+            warnings.warn(
+                f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
+            )
+        x = np.pad(x, int(n_fft // 2), mode=pad_mode)
+
+    elif n_fft > x.shape[-1]:
+        raise ParameterError(
+            f"n_fft={n_fft} is too large for input signal of length={x.shape[-1]}"
+        )
+
+    # Window the time series.
+    x_frames = split_frames(x, frame_length=n_fft, hop_length=hop_length)
+    # Pre-allocate the STFT matrix
+    stft_matrix = np.empty(
+        (int(1 + n_fft // 2), x_frames.shape[1]), dtype=dtype, order="F")
+    # Constrain STFT block sizes to 256 KB
+    MAX_MEM_BLOCK = 2**8 * 2**10
+    # how many columns can we fit within MAX_MEM_BLOCK?
+    n_columns = MAX_MEM_BLOCK // (stft_matrix.shape[0] * stft_matrix.itemsize)
+    n_columns = max(n_columns, 1)
+
+    # Transform the frames block by block to bound the temporary memory.
+    for bl_s in range(0, stft_matrix.shape[1], n_columns):
+        bl_t = min(bl_s + n_columns, stft_matrix.shape[1])
+        stft_matrix[:, bl_s:bl_t] = np.fft.rfft(
+            fft_window * x_frames[:, bl_s:bl_t], axis=0)
+
+    return stft_matrix
+
+
+def power_to_db(spect: array,
+                ref: float=1.0,
+                amin: float=1e-10,
+                top_db: Optional[float]=80.0) -> array:
+    """Convert a power spectrogram to decibel (dB) units.
+
+    Evaluates ``10 * log10(spect / ref)`` with both operands floored at `amin`
+    so the logarithm stays finite. `ref` may be a number or a callable that
+    receives the magnitude and returns the reference power. When `top_db` is
+    not None the output is clipped to lie within `top_db` of its maximum.
+
+    This function is aligned with librosa.
+    """
+    spect = np.asarray(spect)
+
+    if amin <= 0:
+        raise ParameterError("amin must be strictly positive")
+
+    magnitude = spect
+    if np.issubdtype(spect.dtype, np.complexfloating):
+        warnings.warn(
+            "power_to_db was called on complex input so phase "
+            "information will be discarded. To suppress this warning, "
+            "call power_to_db(np.abs(D)**2) instead.")
+        magnitude = np.abs(spect)
+
+    # A callable reference is evaluated on the magnitude itself.
+    ref_value = ref(magnitude) if callable(ref) else np.abs(ref)
+
+    log_spec = 10.0 * np.log10(np.maximum(amin, magnitude))
+    log_spec -= 10.0 * np.log10(np.maximum(amin, ref_value))
+
+    if top_db is None:
+        return log_spec
+
+    if top_db < 0:
+        raise ParameterError("top_db must be non-negative")
+    return np.maximum(log_spec, log_spec.max() - top_db)
+
+
+def mfcc(x,
+         sr: int=16000,
+         spect: Optional[array]=None,
+         n_mfcc: int=20,
+         dct_type: int=2,
+         norm: str="ortho",
+         lifter: int=0,
+         **kwargs) -> array:
+    """Mel-frequency cepstral coefficients (MFCCs)
+
+    When `spect` is None it is computed as ``melspectrogram(x, sr=sr, **kwargs)``.
+    A DCT is then applied along axis 0 and the first `n_mfcc` rows are kept.
+    A positive `lifter` applies sinusoidal liftering; a negative `lifter`
+    raises ParameterError.
+
+    This function is NOT strictly aligned with librosa. The following example shows how to get the
+    same result with librosa:
+
+    # paddleaudio mfcc:
+     kwargs = {
+        'window_size':512,
+        'hop_length':320,
+        'n_mels':64,
+        'fmin':50,
+         'to_db':False}
+    a = mfcc(x,
+        spect=None,
+        n_mfcc=20,
+        dct_type=2,
+        norm='ortho',
+        lifter=0,
+        **kwargs)
+
+    # librosa mfcc:
+    spect = librosa.feature.melspectrogram(x,sr=16000,n_fft=512,
+                                              win_length=512,
+                                              hop_length=320,
+                                              n_mels=64, fmin=50)
+    b = librosa.feature.mfcc(x,
+        sr=16000,
+        S=spect,
+        n_mfcc=20,
+        dct_type=2,
+        norm='ortho',
+        lifter=0)
+
+    assert np.mean( (a-b)**2) < 1e-8
+
+    """
+    if spect is None:
+        spect = melspectrogram(x, sr=sr, **kwargs)
+
+    # DCT along axis 0, keeping only the first n_mfcc coefficients.
+    M = scipy.fftpack.dct(spect, axis=0, type=dct_type, norm=norm)[:n_mfcc]
+
+    if lifter > 0:
+        # Scale coefficient k (1-based) by sin(pi * k / lifter).
+        factor = np.sin(np.pi * np.arange(1, 1 + n_mfcc, dtype=M.dtype) /
+                        lifter)
+        return M * factor[:, np.newaxis]
+    elif lifter == 0:
+        return M
+    else:
+        raise ParameterError(
+            f"MFCC lifter={lifter} must be a non-negative number")
+
+
+def melspectrogram(x: array,
+                   sr: int=16000,
+                   window_size: int=512,
+                   hop_length: int=320,
+                   n_mels: int=64,
+                   fmin: int=50,
+                   fmax: Optional[float]=None,
+                   window: str='hann',
+                   center: bool=True,
+                   pad_mode: str='reflect',
+                   power: float=2.0,
+                   to_db: bool=True,
+                   ref: float=1.0,
+                   amin: float=1e-10,
+                   top_db: Optional[float]=None) -> array:
+    """Compute mel-spectrogram.
+
+    Parameters:
+        x: numpy.ndarray
+        The input waveform is a numpy array [shape=(n,)]
+
+        window_size: int, typically 512, 1024, 2048, etc.
+        The window size for framing, also used as n_fft for stft
+
+        hop_length, window, center, pad_mode: forwarded to `stft`.
+
+        n_mels, fmin, fmax: forwarded to `compute_fbank_matrix`.
+
+        power: exponent applied to the STFT magnitude.
+
+        to_db, ref, amin, top_db: when `to_db` is True the result is passed
+        through `power_to_db` with these arguments.
+
+    Returns:
+        The mel-spectrogram in power scale or db scale(default)
+
+    Raises:
+        ParameterError: if the waveform is invalid or empty, or unless
+        0 <= fmin < fmax.
+
+    Notes:
+    1. sr is default to 16000, which is commonly used in speech/speaker processing.
+    2. when fmax is None, it is set to sr//2.
+    3. this function will convert mel spectrogram to db scale by default. This is different
+    from that of librosa.
+
+    """
+    _check_audio(x, mono=True)
+    if len(x) <= 0:
+        raise ParameterError('The input waveform is empty')
+
+    if fmax is None:
+        fmax = sr // 2
+    # fmin == 0 is accepted; the message states the condition actually checked.
+    if fmin < 0 or fmin >= fmax:
+        raise ParameterError('fmin and fmax must satisfy 0<=fmin<fmax')
+
+    s = stft(
+        x,
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+
+    # Project the power spectrum onto the mel filter bank.
+    spect_power = np.abs(s)**power
+    fb_matrix = compute_fbank_matrix(
+        sr=sr, n_fft=window_size, n_mels=n_mels, fmin=fmin, fmax=fmax)
+    mel_spect = np.matmul(fb_matrix, spect_power)
+    if to_db:
+        return power_to_db(mel_spect, ref=ref, amin=amin, top_db=top_db)
+    return mel_spect
+
+
+def spectrogram(x: array,
+                sr: int=16000,
+                window_size: int=512,
+                hop_length: int=320,
+                window: str='hann',
+                center: bool=True,
+                pad_mode: str='reflect',
+                power: float=2.0) -> array:
+    """Compute a spectrogram from an input waveform.
+
+    Runs this module's `stft` with `window_size` used as both the FFT size and
+    the window length, then raises the magnitude of the complex result to
+    `power`. The `sr` argument is accepted but not used in the computation.
+    """
+    stft_args = dict(
+        n_fft=window_size,
+        hop_length=hop_length,
+        win_length=window_size,
+        window=window,
+        center=center,
+        pad_mode=pad_mode)
+    magnitude = np.abs(stft(x, **stft_args))
+    return magnitude**power
+
+
+def mu_encode(x: array, mu: int=255, quantized: bool=True) -> array:
+    """Mu-law encoding.
+
+    Compute the mu-law encoding of an input signal.
+    When quantized is True, the result is mapped to integer-valued
+    levels in range [0, mu]. Otherwise, the resulting signal
+    is in range [-1, 1].
+
+    Parameters:
+        x: input signal, expected to lie in [-1, 1].
+        mu: compression parameter.
+        quantized: whether to quantize the compressed signal.
+
+    Returns:
+        The compressed (and optionally quantized) signal.
+
+    Reference:
+        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+
+    """
+    # Use the caller's mu; it must not be overwritten by a hard-coded value.
+    y = np.sign(x) * np.log1p(mu * np.abs(x)) / np.log1p(mu)
+    if quantized:
+        y = np.floor((y + 1) / 2 * mu + 0.5)  # convert to [0, mu]
+    return y
+
+
+def mu_decode(y: array, mu: int=255, quantized: bool=True) -> array:
+    """Mu-law decoding.
+
+    Compute the mu-law decoding given an input code. This is the inverse of
+    the mu-law compression ``sign(x) * log1p(mu * |x|) / log1p(mu)`` for the
+    same value of `mu`.
+
+    It assumes that the input y is in range [0, mu] when quantized is True
+    and in [-1, 1] otherwise.
+
+    Parameters:
+        y: encoded signal.
+        mu: compression parameter used for encoding.
+        quantized: whether `y` holds quantized levels.
+
+    Returns:
+        The expanded signal in range [-1, 1].
+
+    Raises:
+        ParameterError: if mu < 1.
+
+    Reference:
+        https://en.wikipedia.org/wiki/%CE%9C-law_algorithm
+
+    """
+    if mu < 1:
+        raise ParameterError('mu is typically set as 2**k-1, k=1, 2, 3,...')
+
+    # mu is used as given: shifting it by one would no longer invert the
+    # compression formula above and would divide by zero for mu == 1.
+    if quantized:  # undo the quantization: [0, mu] -> [-1, 1]
+        y = y * 2 / mu - 1
+    x = np.sign(y) / mu * ((1 + mu)**np.abs(y) - 1)
+    return x
--- a/deepspeech/decoders/ctcdecoder/__init__.py
+++ b/deepspeech/decoders/ctcdecoder/__init__.py
--- a/paddleaudio/paddleaudio/models/panns.py
+++ b/paddleaudio/paddleaudio/models/panns.py
--- a/paddleaudio/paddleaudio/utils/__init__.py
+++ b/paddleaudio/paddleaudio/utils/__init__.py
--- a/paddleaudio/paddleaudio/utils/download.py
+++ b/paddleaudio/paddleaudio/utils/download.py
--- a/paddleaudio/paddleaudio/utils/env.py
+++ b/paddleaudio/paddleaudio/utils/env.py
--- a/paddleaudio/paddleaudio/utils/error.py
+++ b/paddleaudio/paddleaudio/utils/error.py
--- a/paddleaudio/paddleaudio/utils/log.py
+++ b/paddleaudio/paddleaudio/utils/log.py
--- a/paddleaudio/paddleaudio/utils/time.py
+++ b/paddleaudio/paddleaudio/utils/time.py
--- a/audio/setup.py
+++ b/audio/setup.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import setuptools
+
+# set the version here
+version = '0.1.0a'
+
+# The README is used verbatim as the long description; read it as UTF-8 so
+# the result does not depend on the platform's default encoding.
+with open("README.md", "r", encoding="utf-8") as fh:
+    long_description = fh.read()
+
+setuptools.setup(
+    name="paddleaudio",
+    version=version,
+    author="",
+    author_email="",
+    description="PaddleAudio, in development",
+    long_description=long_description,
+    long_description_content_type="text/markdown",
+    url="",
+    packages=setuptools.find_packages(exclude=["build*", "test*", "examples*"]),
+    classifiers=[
+        "Programming Language :: Python :: 3",
+        # Matches the Apache 2.0 license header at the top of this file.
+        "License :: OSI Approved :: Apache Software License",
+        "Operating System :: OS Independent",
+    ],
+    python_requires='>=3.6',
+    install_requires=[
+        'numpy >= 1.15.0',
+        'scipy >= 1.0.0',
+        'resampy >= 0.2.2',
+        'soundfile >= 0.9.0',
+        'colorlog',
+        'pathos',
+    ],
+    extras_require={'dev': ['pytest>=3.7', 'librosa>=0.7.2']
+                    }  # for dev only, install: pip install -e .[dev]
+)
--- a/paddleaudio/test/README.md
+++ b/paddleaudio/test/README.md
--- a/audio/test/unit_test/test_backend.py
+++ b/audio/test/unit_test/test_backend.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import librosa
+import numpy as np
+import paddleaudio
+import pytest
+
+TEST_FILE = './test/data/test_audio.wav'
+
+
+def relative_err(a, b, real=True):
+    """Squared difference of `a` and `b` relative to their combined energy.
+
+    With `real=False` the real and imaginary parts are scored separately and
+    the two errors are added.
+    """
+
+    def _ratio(u, v):
+        return np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))
+
+    if real:
+        return _ratio(a, b)
+    return _ratio(a.real, b.real) + _ratio(a.imag, b.imag)
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def load_audio():
+    """Load the reference wav file with librosa at 16 kHz.
+
+    Prints the mean and standard deviation of the waveform and returns the
+    (waveform, sample_rate) pair.
+    """
+    x, r = librosa.load(TEST_FILE, sr=16000)
+    print(f'librosa: mean: {np.mean(x)}, std:{np.std(x)}')
+    return x, r
+
+
+# Module-level fixtures: the reference waveform is loaded once at import time
+# and shared by the tests below; EPS is the tolerance used in the assertions
+# and the denominator guard in relative_err.
+x, r = load_audio()
+EPS = 1e-8
+
+
+def test_load():
+    """Check sample rate, dtype and duration of paddleaudio.load results."""
+    s, r = paddleaudio.load(TEST_FILE, sr=16000)
+    assert r == 16000
+    assert s.dtype == 'float32'
+
+    # Load a 2-second excerpt starting at 1 s, as int16.
+    s, r = paddleaudio.load(
+        TEST_FILE, sr=16000, offset=1, duration=2, dtype='int16')
+    assert len(s) / r == 2.0
+    assert r == 16000
+    assert s.dtype == 'int16'
+
+
+def test_depth_convert():
+    """Converted audio keeps its length, dtype and value range, and is not constant."""
+    cases = (
+        ('int16', -32768, 32767),
+        ('int8', -128, 127), )
+    for depth, lowest, highest in cases:
+        y = paddleaudio.depth_convert(x, depth)
+        assert len(y) == len(x)
+        assert y.dtype == depth
+        assert np.max(y) <= highest
+        assert np.min(y) >= lowest
+        assert np.std(y) > EPS
+
+
+# Test cases for resample: (target sample rate, resampling mode) pairs fed to
+# test_resample through pytest.mark.parametrize.
+rs_test_data = [
+    (32000, 'kaiser_fast'),
+    (16000, 'kaiser_fast'),
+    (8000, 'kaiser_fast'),
+    (32000, 'kaiser_best'),
+    (16000, 'kaiser_best'),
+    (8000, 'kaiser_best'),
+    (22050, 'kaiser_best'),
+    (44100, 'kaiser_best'),
+]
+
+
+@pytest.mark.parametrize('sr,mode', rs_test_data)
+def test_resample(sr, mode):
+    """Resampled length must match the input length scaled by sr / 16000."""
+    y = paddleaudio.resample(x, 16000, sr, mode=mode)
+    factor = sr / 16000
+    # Compare the two lengths (scalars) with the relative error helper.
+    err = relative_err(len(y), len(x) * factor)
+    print('err:', err)
+    assert err < EPS
+
+
+def test_normalize():
+    """Check the peak after linear normalization and the std after gaussian normalization."""
+    y = paddleaudio.normalize(x, norm_type='linear', mul_factor=0.5)
+    assert np.max(y) < 0.5 + EPS
+
+    y = paddleaudio.normalize(x, norm_type='linear', mul_factor=2.0)
+    assert np.max(y) <= 2.0 + EPS
+
+    y = paddleaudio.normalize(x, norm_type='gaussian', mul_factor=1.0)
+    print('np.std(y):', np.std(y))
+    assert np.abs(np.std(y) - 1.0) < EPS
+
+
+# Allow running the tests directly as a script, without pytest; test_resample
+# is run for a single (sample rate, mode) combination only.
+if __name__ == '__main__':
+    test_load()
+    test_depth_convert()
+    test_resample(22050, 'kaiser_fast')
+    test_normalize()
--- a/audio/test/unit_test/test_features.py
+++ b/audio/test/unit_test/test_features.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import librosa
+import numpy as np
+import paddleaudio as pa
+import pytest
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def load_audio():
+    """Load the reference wav file with librosa and return (waveform, sample_rate)."""
+    # NOTE(review): no `sr` is passed here, while the tests below pass
+    # sr=16000 to the feature functions - confirm which rate is intended.
+    x, r = librosa.load('./test/data/test_audio.wav')
+    #x,r = librosa.load('../data/test_audio.wav',sr=16000)
+    return x, r
+
+
+# Module-level fixtures: the reference waveform is loaded once at import time
+# and shared by the tests below; EPS is the tolerance used in the assertions
+# and the denominator guard in relative_err.
+x, r = load_audio()
+EPS = 1e-8
+
+
+def relative_err(a, b, real=True):
+    """Squared difference of `a` and `b` relative to their combined energy.
+
+    With `real=False` the real and imaginary parts are scored separately and
+    the two errors are added.
+    """
+
+    def _ratio(u, v):
+        return np.sum((u - v)**2) / (EPS + np.sum(u**2) + np.sum(v**2))
+
+    if real:
+        return _ratio(a, b)
+    return _ratio(a.real, b.real) + _ratio(a.imag, b.imag)
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_melspectrogram():
+    """Power-scale mel spectrogram must match librosa's for the same settings."""
+    a = pa.melspectrogram(
+        x,
+        window_size=512,
+        sr=16000,
+        hop_length=320,
+        n_mels=64,
+        fmin=50,
+        to_db=False, )
+    b = librosa.feature.melspectrogram(
+        x,
+        sr=16000,
+        n_fft=512,
+        win_length=512,
+        hop_length=320,
+        n_mels=64,
+        fmin=50)
+    assert relative_err(a, b) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_melspectrogram_db():
+    """dB-scale mel spectrogram must match librosa's output converted with pa.power_to_db."""
+
+    a = pa.melspectrogram(
+        x,
+        window_size=512,
+        sr=16000,
+        hop_length=320,
+        n_mels=64,
+        fmin=50,
+        to_db=True,
+        ref=1.0,
+        amin=1e-10,
+        top_db=None)
+    b = librosa.feature.melspectrogram(
+        x,
+        sr=16000,
+        n_fft=512,
+        win_length=512,
+        hop_length=320,
+        n_mels=64,
+        fmin=50)
+    # The librosa reference is in power scale; convert it with the same settings.
+    b = pa.power_to_db(b, ref=1.0, amin=1e-10, top_db=None)
+    assert relative_err(a, b) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_stft():
+    """STFT must match librosa in shape and in real/imaginary parts."""
+    a = pa.stft(x, n_fft=1024, hop_length=320, win_length=512)
+    b = librosa.stft(x, n_fft=1024, hop_length=320, win_length=512)
+    assert a.shape == b.shape
+    assert relative_err(a, b, real=False) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_split_frames():
+    """Framing must match librosa.util.frame for the same frame and hop lengths."""
+    a = librosa.util.frame(x, frame_length=512, hop_length=320)
+    b = pa.split_frames(x, frame_length=512, hop_length=320)
+    assert relative_err(a, b) < EPS
+
+
+@pytest.mark.filterwarnings("ignore::DeprecationWarning")
+def test_mfcc():
+    """MFCCs must match librosa's when librosa is given the equivalent mel spectrogram."""
+    # Keyword arguments forwarded by pa.mfcc to its mel spectrogram step.
+    kwargs = {
+        'window_size': 512,
+        'hop_length': 320,
+        'n_mels': 64,
+        'fmin': 50,
+        'to_db': False
+    }
+    a = pa.mfcc(
+        x,
+        #sample_rate=16000,
+        spect=None,
+        n_mfcc=20,
+        dct_type=2,
+        norm='ortho',
+        lifter=0,
+        **kwargs)
+    S = librosa.feature.melspectrogram(
+        x,
+        sr=16000,
+        n_fft=512,
+        win_length=512,
+        hop_length=320,
+        n_mels=64,
+        fmin=50)
+    b = librosa.feature.mfcc(
+        x, sr=16000, S=S, n_mfcc=20, dct_type=2, norm='ortho', lifter=0)
+    assert relative_err(a, b) < EPS
+
+
+# Allow running the tests directly as a script, without pytest.
+if __name__ == '__main__':
+    test_melspectrogram()
+    test_melspectrogram_db()
+    test_stft()
+    test_split_frames()
+    test_mfcc()
--- a/deepspeech/__init__.py
+++ b/deepspeech/__init__.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from typing import Any
-from typing import List
-from typing import Tuple
-from typing import Union
-
-import paddle
-from paddle import nn
-from paddle.fluid import core
-from paddle.nn import functional as F
-
-from deepspeech.utils.log import Log
-
-#TODO(Hui Zhang): remove  fluid import
-logger = Log(__name__).getlog()
-
-########### hack logging #############
-# Make `logger.warn` an alias of `logger.warning` on this logger object.
-logger.warn = logger.warning
-
-########### hack paddle #############
-# Short dtype aliases attached to the `paddle` module; each one is bound to
-# the dtype *name string*, not to a dtype object.
-paddle.half = 'float16'
-paddle.float = 'float32'
-paddle.double = 'float64'
-paddle.short = 'int16'
-paddle.int = 'int32'
-paddle.long = 'int64'
-paddle.uint16 = 'uint16'
-paddle.cdouble = 'complex128'
-
-
-def convert_dtype_to_string(tensor_dtype):
-    """Translate a ``core.VarDesc.VarType`` value into the ``paddle`` dtype attribute.
-
-    Args:
-        tensor_dtype(core.VarDesc.VarType): the dtype to translate.
-    Returns:
-        The attribute of the ``paddle`` module with the corresponding name,
-        e.g. ``paddle.float32`` for ``FP32``.
-    Raises:
-        ValueError: if ``tensor_dtype`` equals none of the known types.
-    """
-    # (VarType member name, paddle attribute name), tested in this order.
-    # Both attributes are looked up lazily, only when their row is reached.
-    known = (
-        ('FP32', 'float32'),
-        ('FP64', 'float64'),
-        ('FP16', 'float16'),
-        ('INT32', 'int32'),
-        ('INT16', 'int16'),
-        ('INT64', 'int64'),
-        ('BOOL', 'bool'),
-        # since there is still no support for bfloat16 in NumPy,
-        # uint16 is used for casting bfloat16
-        ('BF16', 'uint16'),
-        ('UINT8', 'uint8'),
-        ('INT8', 'int8'),
-        ('COMPLEX64', 'complex64'),
-        ('COMPLEX128', 'complex128'), )
-    for var_name, paddle_name in known:
-        if tensor_dtype == getattr(core.VarDesc.VarType, var_name):
-            return getattr(paddle, paddle_name)
-    raise ValueError("Not supported tensor dtype %s" % tensor_dtype)
-
-
-# Expose a few `paddle.nn.functional` ops as attributes of `paddle` itself,
-# but only for names `paddle` does not already define.
-for _op in ('softmax', 'log_softmax', 'sigmoid', 'log_sigmoid', 'relu'):
-    if hasattr(paddle, _op):
-        continue
-    logger.debug("register user %s to paddle, remove this when fixed!" % _op)
-    setattr(paddle, _op, getattr(paddle.nn.functional, _op))
-del _op
-
-
-def cat(xs, dim=0):
-    """Concatenate ``xs`` along axis ``dim`` by delegating to ``paddle.concat``."""
-    joined = paddle.concat(xs, axis=dim)
-    return joined
-
-
-# Installed only when `paddle` has no `cat` attribute of its own.
-if not hasattr(paddle, 'cat'):
-    logger.debug(
-        "override cat of paddle if exists or register, remove this when fixed!")
-    setattr(paddle, 'cat', cat)
-
-
-########### hack paddle.Tensor #############
-def item(x: paddle.Tensor):
-    """Return the result of ``x.numpy().item()``."""
-    as_array = x.numpy()
-    return as_array.item()
-
-
-# Installed only when `paddle.Tensor` has no `item` attribute of its own.
-if not hasattr(paddle.Tensor, 'item'):
-    logger.debug(
-        "override item of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    setattr(paddle.Tensor, 'item', item)
-
-
-def func_long(x: paddle.Tensor):
-    """Cast ``x`` to the dtype named by ``paddle.long`` using ``paddle.cast``."""
-    target = paddle.long
-    return paddle.cast(x, target)
-
-
-# Installed only when `paddle.Tensor` has no `long` attribute of its own.
-if not hasattr(paddle.Tensor, 'long'):
-    logger.debug(
-        "override long of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    setattr(paddle.Tensor, 'long', func_long)
-
-# Bind `paddle.numel` as a Tensor method when the class does not have one.
-if not hasattr(paddle.Tensor, 'numel'):
-    logger.debug(
-        "override numel of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    setattr(paddle.Tensor, 'numel', paddle.numel)
-
-
-def new_full(x: paddle.Tensor,
-             size: Union[List[int], Tuple[int], paddle.Tensor],
-             fill_value: Union[float, int, bool, paddle.Tensor],
-             dtype=None):
-    """Create a tensor of shape ``size`` filled with ``fill_value``.
-
-    Args:
-        x: tensor whose dtype is used when ``dtype`` is not given.
-        size: shape of the new tensor, forwarded to ``paddle.full``.
-        fill_value: value every element is set to.
-        dtype: dtype of the result; ``None`` (default) means ``x.dtype``.
-    Returns:
-        The tensor returned by ``paddle.full``.
-    """
-    # Honor an explicit dtype; previously the argument was accepted but
-    # ignored and the result always took x.dtype.
-    out_dtype = x.dtype if dtype is None else dtype
-    return paddle.full(size, fill_value, dtype=out_dtype)
-
-
-# Installed only when `paddle.Tensor` has no `new_full` attribute of its own.
-if not hasattr(paddle.Tensor, 'new_full'):
-    logger.debug(
-        "override new_full of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    paddle.Tensor.new_full = new_full
-
-
-def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor:
-    """Compare ``xs`` with ``ys`` through ``Tensor.equal``.
-
-    A bool ``xs`` is first cast to ``paddle.int``; ``ys`` is then turned into
-    a tensor with the (possibly updated) dtype and the place of ``xs``.
-    """
-    if convert_dtype_to_string(xs.dtype) == paddle.bool:
-        xs = xs.astype(paddle.int)
-    other = paddle.to_tensor(
-        ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place)
-    return xs.equal(other)
-
-
-# Register as a Tensor method and as a module-level function, each only when
-# the respective attribute is missing.
-if not hasattr(paddle.Tensor, 'eq'):
-    logger.debug(
-        "override eq of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    setattr(paddle.Tensor, 'eq', eq)
-
-if not hasattr(paddle, 'eq'):
-    logger.debug(
-        "override eq of paddle if exists or register, remove this when fixed!")
-    setattr(paddle, 'eq', eq)
-
-
-def contiguous(xs: paddle.Tensor) -> paddle.Tensor:
-    """Return ``xs`` itself, unchanged (no copy is made)."""
-    return xs
-
-
-# Installed only when `paddle.Tensor` has no `contiguous` attribute.
-if not hasattr(paddle.Tensor, 'contiguous'):
-    logger.debug(
-        "override contiguous of paddle.Tensor if exists or register, remove this when fixed!"
-    )
-    setattr(paddle.Tensor, 'contiguous', contiguous)
-
-
-def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
-    """Return ``paddle.shape(xs)``, or one entry of it.
-
-    With no extra argument the whole shape is returned; with one argument
-    the shape is indexed by it. More than one argument trips the assert.
-    """
-    assert (len(args) <= 1)
-    shape = paddle.shape(xs)
-    if len(args) != 1:
-        return shape
-    return shape[args[0]]
-
-
-#`to_static` do not process `size` property, maybe some `paddle` api dependent on it.
-# Unlike the other patches in this file, this one is applied unconditionally.
-logger.debug(
-    "override size of paddle.Tensor "
-    "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!"
-)
-setattr(paddle.Tensor, 'size', size)
-
-
-def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
-    """Reshape ``xs`` to the shape given as positional arguments."""
-    target_shape = args
-    return xs.reshape(target_shape)
-
-
-# Installed only when `paddle.Tensor` has no `view` attribute of its own.
-if not hasattr(paddle.Tensor, 'view'):
-    logger.debug("register user view to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'view', view)
-
-
-def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
-    """Reshape ``xs`` to whatever ``ys.size()`` returns."""
-    target_shape = ys.size()
-    return xs.reshape(target_shape)
-
-
-# Installed only when `paddle.Tensor` has no `view_as` attribute of its own.
-if not hasattr(paddle.Tensor, 'view_as'):
-    logger.debug(
-        "register user view_as to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'view_as', view_as)
-
-
-def is_broadcastable(shp1, shp2):
-    """Tell whether two shapes are compatible dimension by dimension.
-
-    Dimensions are paired starting from the trailing end; a pair is
-    compatible when either side is 1 or both sides are equal. Leading
-    dimensions that only the longer shape has are not examined.
-    """
-    return all(a == 1 or b == 1 or a == b
-               for a, b in zip(shp1[::-1], shp2[::-1]))
-
-
-def masked_fill(xs: paddle.Tensor,
-                mask: paddle.Tensor,
-                value: Union[float, int]):
-    """Return a tensor equal to ``xs`` except that masked positions hold ``value``.
-
-    ``mask`` is broadcast to the common shape of ``xs`` and ``mask`` before
-    ``paddle.where`` selects between the filler and ``xs``.
-    """
-    assert is_broadcastable(xs.shape, mask.shape) is True, (xs.shape,
-                                                            mask.shape)
-    full_mask = mask.broadcast_to(
-        paddle.broadcast_shape(xs.shape, mask.shape))
-    filler = paddle.ones_like(xs) * value
-    return paddle.where(full_mask, filler, xs)
-
-
-# Installed only when `paddle.Tensor` has no `masked_fill` attribute.
-if not hasattr(paddle.Tensor, 'masked_fill'):
-    logger.debug(
-        "register user masked_fill to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'masked_fill', masked_fill)
-
-
-def masked_fill_(xs: paddle.Tensor,
-                 mask: paddle.Tensor,
-                 value: Union[float, int]) -> paddle.Tensor:
-    """Variant of ``masked_fill`` that writes the result back into ``xs``.
-
-    The filled tensor is computed with ``paddle.where``, detached, and
-    assigned to ``xs`` through ``paddle.assign``; ``xs`` is returned.
-    """
-    assert is_broadcastable(xs.shape, mask.shape) is True
-    full_mask = mask.broadcast_to(
-        paddle.broadcast_shape(xs.shape, mask.shape))
-    filler = paddle.ones_like(xs) * value
-    filled = paddle.where(full_mask, filler, xs)
-    paddle.assign(filled.detach(), output=xs)
-    return xs
-
-
-# Installed only when `paddle.Tensor` has no `masked_fill_` attribute.
-if not hasattr(paddle.Tensor, 'masked_fill_'):
-    logger.debug(
-        "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'masked_fill_', masked_fill_)
-
-
-def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
-    """Overwrite ``xs`` with a ``value``-filled tensor of the same shape and return ``xs``."""
-    constant = paddle.full_like(xs, value)
-    paddle.assign(constant.detach(), output=xs)
-    return xs
-
-
-# Installed only when `paddle.Tensor` has no `fill_` attribute of its own.
-if not hasattr(paddle.Tensor, 'fill_'):
-    logger.debug(
-        "register user fill_ to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'fill_', fill_)
-
-
-def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
-    """Tile ``xs`` with ``paddle.tile``, passing the positional arguments as repeat counts."""
-    repeat_times = size
-    return paddle.tile(xs, repeat_times)
-
-
-# Installed only when `paddle.Tensor` has no `repeat` attribute of its own.
-if not hasattr(paddle.Tensor, 'repeat'):
-    logger.debug(
-        "register user repeat to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'repeat', repeat)
-
-# Attach a few `paddle.nn.functional` ops as Tensor methods, but only for
-# names the Tensor class does not already define.
-for _op in ('softmax', 'sigmoid', 'relu'):
-    if hasattr(paddle.Tensor, _op):
-        continue
-    logger.debug(
-        "register user %s to paddle.Tensor, remove this when fixed!" % _op)
-    setattr(paddle.Tensor, _op, getattr(paddle.nn.functional, _op))
-del _op
-
-
-def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
-    """Cast ``x`` to the dtype of ``other``."""
-    target = other.dtype
-    return x.astype(target)
-
-
-# Installed only when `paddle.Tensor` has no `type_as` attribute of its own.
-if not hasattr(paddle.Tensor, 'type_as'):
-    logger.debug(
-        "register user type_as to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.type_as = type_as
-
-
-def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
-    """Minimal ``Tensor.to``: cast by dtype or by example tensor.
-
-    Exactly one positional argument is expected (enforced by an assert);
-    keyword arguments are accepted but ignored.
-
-    Args:
-        *args: a dtype name string, a ``core.VarDesc.VarType`` dtype object,
-            a tensor whose dtype should be adopted, or anything else, which
-            is treated as a device.
-    Returns:
-        ``x`` cast to the requested dtype, or ``x`` itself for a device
-        argument (no device transfer is performed).
-    """
-    assert len(args) == 1
-    target = args[0]
-    # dtype objects such as `paddle.float32` are not strings; without the
-    # VarType check they fell through to the device branch and the cast was
-    # silently skipped.
-    if isinstance(target, (str, core.VarDesc.VarType)):  # dtype
-        return x.astype(target)
-    elif isinstance(target, paddle.Tensor):  # Tensor
-        return x.astype(target.dtype)
-    else:  # Device
-        return x
-
-
-# Installed only when `paddle.Tensor` has no `to` attribute of its own.
-if not hasattr(paddle.Tensor, 'to'):
-    logger.debug("register user to to paddle.Tensor, remove this when fixed!")
-    setattr(paddle.Tensor, 'to', to)
-
-
-def func_float(x: paddle.Tensor) -> paddle.Tensor:
-    """Cast ``x`` to the dtype named by ``paddle.float``."""
-    target = paddle.float
-    return x.astype(target)
-
-
-# Installed only when `paddle.Tensor` has no `float` attribute of its own.
-if not hasattr(paddle.Tensor, 'float'):
-    logger.debug(
-        "register user float to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.float = func_float
-
-
-def func_int(x: paddle.Tensor) -> paddle.Tensor:
-    """Cast ``x`` to the dtype named by ``paddle.int``."""
-    target = paddle.int
-    return x.astype(target)
-
-
-# Installed only when `paddle.Tensor` has no `int` attribute of its own.
-if not hasattr(paddle.Tensor, 'int'):
-    logger.debug("register user int to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.int = func_int
-
-
-def tolist(x: paddle.Tensor) -> List[Any]:
-    """Return the result of ``x.numpy().tolist()``."""
-    as_array = x.numpy()
-    return as_array.tolist()
-
-
-# Installed only when `paddle.Tensor` has no `tolist` attribute of its own.
-if not hasattr(paddle.Tensor, 'tolist'):
-    logger.debug(
-        "register user tolist to paddle.Tensor, remove this when fixed!")
-    paddle.Tensor.tolist = tolist
-
-########### hack paddle.nn #############
-from paddle.nn import Layer
-from typing import Optional
-from typing import Mapping
-from typing import Iterable
-from typing import Tuple
-from typing import Iterator
-from collections import OrderedDict, abc as container_abcs
-
-
-class LayerDict(paddle.nn.Layer):
-    r"""Holds sublayers in a dictionary.
-
-    :class:`~paddle.nn.LayerDict` can be indexed like a regular Python dictionary,
-    but the layers it contains are registered through
-    :meth:`~paddle.nn.Layer.add_sublayer`, so they are stored alongside the
-    layer's other sublayers.
-
-    :class:`~paddle.nn.LayerDict` is an **ordered** dictionary that respects
-
-    * the order of insertion, and
-
-    * in :meth:`~paddle.nn.LayerDict.update`, the order of the merged
-      ``OrderedDict``, ``dict`` (started from Python 3.6) or another
-      :class:`~paddle.nn.LayerDict` (the argument to
-      :meth:`~paddle.nn.LayerDict.update`).
-
-    Note that :meth:`~paddle.nn.LayerDict.update` with other unordered mapping
-    types (e.g., Python's plain ``dict`` before Python version 3.6) does not
-    preserve the order of the merged mapping.
-
-    Args:
-        modules (iterable, optional): a mapping (dictionary) of (string: layer)
-            or an iterable of key-value pairs of type (string, layer)
-
-    Example::
-
-        class MyModule(nn.Layer):
-            def __init__(self):
-                super(MyModule, self).__init__()
-                self.choices = nn.LayerDict({
-                        'conv': nn.Conv2D(10, 10, 3),
-                        'pool': nn.MaxPool2D(3)
-                })
-                self.activations = nn.LayerDict([
-                        ['lrelu', nn.LeakyReLU()],
-                        ['prelu', nn.PReLU()]
-                ])
-
-            def forward(self, x, choice, act):
-                x = self.choices[choice](x)
-                x = self.activations[act](x)
-                return x
-    """
-
-    # NOTE: storage is ``paddle.nn.Layer._sub_layers`` and registration goes
-    # through ``add_sublayer``; the torch names ``_modules`` / ``add_module``
-    # used previously are not provided by ``paddle.nn.Layer``.
-
-    def __init__(self, modules: Optional[Mapping[str, Layer]]=None) -> None:
-        super(LayerDict, self).__init__()
-        if modules is not None:
-            self.update(modules)
-
-    def __getitem__(self, key: str) -> Layer:
-        return self._sub_layers[key]
-
-    def __setitem__(self, key: str, module: Layer) -> None:
-        self.add_sublayer(key, module)
-
-    def __delitem__(self, key: str) -> None:
-        del self._sub_layers[key]
-
-    def __len__(self) -> int:
-        return len(self._sub_layers)
-
-    def __iter__(self) -> Iterator[str]:
-        return iter(self._sub_layers)
-
-    def __contains__(self, key: str) -> bool:
-        return key in self._sub_layers
-
-    def clear(self) -> None:
-        """Remove all items from the LayerDict.
-        """
-        self._sub_layers.clear()
-
-    def pop(self, key: str) -> Layer:
-        r"""Remove key from the LayerDict and return its layer.
-
-        Args:
-            key (string): key to pop from the LayerDict
-        """
-        v = self[key]
-        del self[key]
-        return v
-
-    def keys(self) -> Iterable[str]:
-        r"""Return an iterable of the LayerDict keys.
-        """
-        return self._sub_layers.keys()
-
-    def items(self) -> Iterable[Tuple[str, Layer]]:
-        r"""Return an iterable of the LayerDict key/value pairs.
-        """
-        return self._sub_layers.items()
-
-    def values(self) -> Iterable[Layer]:
-        r"""Return an iterable of the LayerDict values.
-        """
-        return self._sub_layers.values()
-
-    def update(self, modules: Mapping[str, Layer]) -> None:
-        r"""Update the :class:`~paddle.nn.LayerDict` with the key-value pairs from a
-        mapping or an iterable, overwriting existing keys.
-
-        .. note::
-            If :attr:`modules` is an ``OrderedDict``, a :class:`~paddle.nn.LayerDict`, or
-            an iterable of key-value pairs, the order of new elements in it is preserved.
-
-        Args:
-            modules (iterable): a mapping (dictionary) from string to :class:`~paddle.nn.Layer`,
-                or an iterable of key-value pairs of type (string, :class:`~paddle.nn.Layer`)
-
-        Raises:
-            TypeError: if ``modules`` (or one of its elements) is not iterable.
-            ValueError: if an element does not have exactly two items.
-        """
-        if not isinstance(modules, container_abcs.Iterable):
-            raise TypeError("LayerDict.update should be called with an "
-                            "iterable of key/value pairs, but got " + type(
-                                modules).__name__)
-
-        if isinstance(modules,
-                      (OrderedDict, LayerDict, container_abcs.Mapping)):
-            for key, module in modules.items():
-                self[key] = module
-        else:
-            # modules here can be a list with two items
-            for j, m in enumerate(modules):
-                if not isinstance(m, container_abcs.Iterable):
-                    # A space after "is" keeps the type name from being glued
-                    # to the preceding word in the message.
-                    raise TypeError("LayerDict update sequence element "
-                                    "#" + str(j) + " should be Iterable; is " +
-                                    type(m).__name__)
-                if not len(m) == 2:
-                    raise ValueError("LayerDict update sequence element "
-                                     "#" + str(j) + " has length " + str(
-                                         len(m)) + "; 2 is required")
-                # modules can be Mapping (what it's typed at), or a list: [(name1, module1), (name2, module2)]
-                # that's too cumbersome to type correctly with overloads, so we add an ignore here
-                self[m[0]] = m[1]  # type: ignore[assignment]
-
-    # No `forward` is defined on purpose; calling the container falls back to
-    # the base Layer's forward.
-
-
-# Publish the container as `paddle.nn.LayerDict` when Paddle has none.
-if not hasattr(paddle.nn, 'LayerDict'):
-    logger.debug(
-        "register user LayerDict to paddle.nn, remove this when fixed!")
-    paddle.nn.LayerDict = LayerDict
--- a/deepspeech/decoders/beam_search/beam_search.py
+++ b/deepspeech/decoders/beam_search/beam_search.py
--- a/deepspeech/decoders/ctcdecoder/tests/test_decoders.py
+++ b/deepspeech/decoders/ctcdecoder/tests/test_decoders.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Test decoders."""
-import unittest
-
-from deepspeech.decoders import decoders_deprecated as decoder
-
-
-class TestDecoders(unittest.TestCase):
-    """Checks the greedy and beam-search CTC decoders on two fixed sequences."""
-
-    def setUp(self):
-        """Build the vocabulary, two probability sequences and the expected strings."""
-        # Six symbols, while every probability row below has seven entries;
-        # presumably the extra column is the CTC blank -- TODO confirm.
-        self.vocab_list = ["\'", ' ', 'a', 'b', 'c', 'd']
-        self.beam_size = 20
-        # Six time steps, seven probabilities per step.
-        self.probs_seq1 = [[
-            0.06390443, 0.21124858, 0.27323887, 0.06870235, 0.0361254,
-            0.18184413, 0.16493624
-        ], [
-            0.03309247, 0.22866108, 0.24390638, 0.09699597, 0.31895462,
-            0.0094893, 0.06890021
-        ], [
-            0.218104, 0.19992557, 0.18245131, 0.08503348, 0.14903535,
-            0.08424043, 0.08120984
-        ], [
-            0.12094152, 0.19162472, 0.01473646, 0.28045061, 0.24246305,
-            0.05206269, 0.09772094
-        ], [
-            0.1333387, 0.00550838, 0.00301669, 0.21745861, 0.20803985,
-            0.41317442, 0.01946335
-        ], [
-            0.16468227, 0.1980699, 0.1906545, 0.18963251, 0.19860937,
-            0.04377724, 0.01457421
-        ]]
-        self.probs_seq2 = [[
-            0.08034842, 0.22671944, 0.05799633, 0.36814645, 0.11307441,
-            0.04468023, 0.10903471
-        ], [
-            0.09742457, 0.12959763, 0.09435383, 0.21889204, 0.15113123,
-            0.10219457, 0.20640612
-        ], [
-            0.45033529, 0.09091417, 0.15333208, 0.07939558, 0.08649316,
-            0.12298585, 0.01654384
-        ], [
-            0.02512238, 0.22079203, 0.19664364, 0.11906379, 0.07816055,
-            0.22538587, 0.13483174
-        ], [
-            0.17928453, 0.06065261, 0.41153005, 0.1172041, 0.11880313,
-            0.07113197, 0.04139363
-        ], [
-            0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306,
-            0.05294827, 0.22298418
-        ]]
-        # Expected outputs, indexed like the sequences: [seq1, seq2].
-        self.greedy_result = ["ac'bdc", "b'da"]
-        self.beam_search_result = ['acdc', "b'a"]
-
-    def test_greedy_decoder_1(self):
-        """Greedy decoding of sequence 1 yields the first expected string."""
-        bst_result = decoder.ctc_greedy_decoder(self.probs_seq1,
-                                                self.vocab_list)
-        self.assertEqual(bst_result, self.greedy_result[0])
-
-    def test_greedy_decoder_2(self):
-        """Greedy decoding of sequence 2 yields the second expected string."""
-        bst_result = decoder.ctc_greedy_decoder(self.probs_seq2,
-                                                self.vocab_list)
-        self.assertEqual(bst_result, self.greedy_result[1])
-
-    def test_beam_search_decoder_1(self):
-        """The first beam-search result for sequence 1 carries the expected text."""
-        beam_result = decoder.ctc_beam_search_decoder(
-            probs_seq=self.probs_seq1,
-            beam_size=self.beam_size,
-            vocabulary=self.vocab_list)
-        # Element [0][1]: text field of the first returned entry.
-        self.assertEqual(beam_result[0][1], self.beam_search_result[0])
-
-    def test_beam_search_decoder_2(self):
-        """The first beam-search result for sequence 2 carries the expected text."""
-        beam_result = decoder.ctc_beam_search_decoder(
-            probs_seq=self.probs_seq2,
-            beam_size=self.beam_size,
-            vocabulary=self.vocab_list)
-        self.assertEqual(beam_result[0][1], self.beam_search_result[1])
-
-    def test_beam_search_decoder_batch(self):
-        """Batch decoding of both sequences matches the per-sequence expectations."""
-        beam_results = decoder.ctc_beam_search_decoder_batch(
-            probs_split=[self.probs_seq1, self.probs_seq2],
-            beam_size=self.beam_size,
-            vocabulary=self.vocab_list,
-            num_processes=24)
-        self.assertEqual(beam_results[0][0][1], self.beam_search_result[0])
-        self.assertEqual(beam_results[1][0][1], self.beam_search_result[1])
-
-
-# Run the test cases through unittest when this file is executed directly.
-if __name__ == '__main__':
-    unittest.main()
--- a/deepspeech/decoders/recog.py
+++ b/deepspeech/decoders/recog.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
-import jsonlines
-import paddle
-from yacs.config import CfgNode
-
-from .beam_search import BatchBeamSearch
-from .beam_search import BeamSearch
-from .scorers.length_bonus import LengthBonus
-from .scorers.scorer_interface import BatchScorerInterface
-from .utils import add_results_to_json
-from deepspeech.exps import dynamic_import_tester
-from deepspeech.io.reader import LoadInputsAndTargets
-from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.models.lm_interface import dynamic_import_lm
-from deepspeech.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-# NOTE: you need this func to generate our sphinx doc
-
-
-def get_config(config_path):
-    """Create a ``CfgNode`` that allows new keys and merge the file at ``config_path`` into it."""
-    cfg = CfgNode(new_allowed=True)
-    cfg.merge_from_file(config_path)
-    return cfg
-
-
-def load_trained_model(args):
-    """Set up and restore the experiment named by ``args.model_name``.
-
-    ``args.nprocs`` is overwritten with ``args.ngpu`` as a side effect.
-
-    Returns:
-        tuple: ``(model, char_list, exp, confs)`` where ``confs`` is the
-        config loaded from ``args.model_conf``.
-    """
-    args.nprocs = args.ngpu
-    confs = get_config(args.model_conf)
-    exp = dynamic_import_tester(args.model_name)(confs, args)
-    # setup/restore run inside the experiment's eval() context.
-    with exp.eval():
-        exp.setup()
-        exp.restore()
-    char_list = exp.args.char_list
-    return exp.model, char_list, exp, confs
-
-
-def load_trained_lm(args):
-    """Instantiate the language model described by ``args.rnnlm_conf`` and load its weights.
-
-    The class is resolved from the config's ``model_module`` entry, built
-    with the config's ``model`` entry as keyword arguments, and given the
-    state dict loaded from ``args.rnnlm``.
-    """
-    lm_conf = get_config(args.rnnlm_conf)
-    lm = dynamic_import_lm(lm_conf.model_module)(**lm_conf.model)
-    lm.set_state_dict(paddle.load(args.rnnlm))
-    return lm
-
-
-def recog_v2(args):
-    """Decode with custom models that implements ScorerInterface.
-
-    Reads utterances from ``args.recog_json`` (jsonlines), decodes them one
-    at a time with beam search and writes one record per utterance to
-    ``args.result_label`` (jsonlines).
-
-    Args:
-        args (namespace): The program arguments.
-        See py:func:`bin.asr_recog.get_parser` for details
-
-    Raises:
-        NotImplementedError: for batch size > 1, streaming mode, word LM,
-            or more than one GPU.
-
-    """
-    logger.warning("experimental API for custom LMs is selected by --api v2")
-    # Reject the option combinations this backend does not handle.
-    if args.batchsize > 1:
-        raise NotImplementedError("multi-utt batch decoding is not implemented")
-    if args.streaming_mode is not None:
-        raise NotImplementedError("streaming mode is not implemented")
-    if args.word_rnnlm:
-        raise NotImplementedError("word LM is not implemented")
-
-    # set_deterministic(args)
-    model, char_list, exp, confs = load_trained_model(args)
-    assert isinstance(model, ASRInterface)
-
-    # Feature loader; the augmentation config of the model config is used
-    # unless args.preprocess_conf is given.
-    load_inputs_and_targets = LoadInputsAndTargets(
-        mode="asr",
-        load_output=False,
-        sort_in_input_length=False,
-        preprocess_conf=confs.collator.augmentation_config
-        if args.preprocess_conf is None else args.preprocess_conf,
-        preprocess_args={"train": False}, )
-
-    # Optional neural LM scorer.
-    if args.rnnlm:
-        lm = load_trained_lm(args)
-        lm.eval()
-    else:
-        lm = None
-
-    # Optional n-gram scorer; "full" selects the full scorer, anything else
-    # the partial one.
-    if args.ngram_model:
-        from .scorers.ngram import NgramFullScorer
-        from .scorers.ngram import NgramPartScorer
-
-        if args.ngram_scorer == "full":
-            ngram = NgramFullScorer(args.ngram_model, char_list)
-        else:
-            ngram = NgramPartScorer(args.ngram_model, char_list)
-    else:
-        ngram = None
-
-    # Scorers provided by the model, extended with LM, n-gram and length
-    # bonus entries (lm / ngram may be None).
-    scorers = model.scorers()  # decoder
-    scorers["lm"] = lm
-    scorers["ngram"] = ngram
-    scorers["length_bonus"] = LengthBonus(len(char_list))
-    # Per-scorer weights; decoder and ctc weights sum to 1.
-    weights = dict(
-        decoder=1.0 - args.ctc_weight,
-        ctc=args.ctc_weight,
-        lm=args.lm_weight,
-        ngram=args.ngram_weight,
-        length_bonus=args.penalty, )
-    beam_search = BeamSearch(
-        beam_size=args.beam_size,
-        vocab_size=len(char_list),
-        weights=weights,
-        scorers=scorers,
-        sos=model.sos,
-        eos=model.eos,
-        token_list=char_list,
-        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )
-
-    # TODO(karita): make all scorers batchfied
-    if args.batchsize == 1:
-        # Names of full scorers that do not implement the batch interface.
-        non_batch = [
-            k for k, v in beam_search.full_scorers.items()
-            if not isinstance(v, BatchScorerInterface)
-        ]
-        if len(non_batch) == 0:
-            # Switch the existing object to the batched implementation by
-            # reassigning its class.
-            beam_search.__class__ = BatchBeamSearch
-            logger.info("BatchBeamSearch implementation is selected.")
-        else:
-            logger.warning(f"As non-batch scorers {non_batch} are found, "
-                           f"fall back to non-batch implementation.")
-
-    # Device selection: at most one GPU, otherwise CPU.
-    if args.ngpu > 1:
-        raise NotImplementedError("only single GPU decoding is supported")
-    if args.ngpu == 1:
-        device = "gpu:0"
-    else:
-        device = "cpu"
-    paddle.set_device(device)
-    # Resolve the dtype name (e.g. "float32") to the `paddle` attribute.
-    dtype = getattr(paddle, args.dtype)
-    logger.info(f"Decoding device={device}, dtype={dtype}")
-    model.to(device=device, dtype=dtype)
-    model.eval()
-    beam_search.to(device=device, dtype=dtype)
-    beam_search.eval()
-
-    # read json data
-    js = []
-    with jsonlines.open(args.recog_json, "r") as reader:
-        for item in reader:
-            js.append(item)
-    # jsonlines to dict, key by 'utt', value by jsonline
-    js = {item['utt']: item for item in js}
-
-    # Decoded results keyed by utterance name.
-    new_js = {}
-    with paddle.no_grad():
-        with jsonlines.open(args.result_label, "w") as f:
-            for idx, name in enumerate(js.keys(), 1):
-                logger.info(f"({idx}/{len(js.keys())}) decoding " + name)
-                # Single-utterance batch; take the first feature of the
-                # first returned element.
-                batch = [(name, js[name])]
-                feat = load_inputs_and_targets(batch)[0][0]
-                logger.info(f'feat: {feat.shape}')
-                enc = model.encode(paddle.to_tensor(feat).to(dtype))
-                logger.info(f'eout: {enc.shape}')
-                nbest_hyps = beam_search(
-                    x=enc,
-                    maxlenratio=args.maxlenratio,
-                    minlenratio=args.minlenratio)
-                # Keep at most args.nbest hypotheses, converted to dicts.
-                nbest_hyps = [
-                    h.asdict()
-                    for h in nbest_hyps[:min(len(nbest_hyps), args.nbest)]
-                ]
-                new_js[name] = add_results_to_json(js[name], nbest_hyps,
-                                                   char_list)
-
-                item = new_js[name]['output'][0]  # 1-best
-                ref = item['text']
-                # Turn the '▁' marker into spaces and drop the <eos> token.
-                rec_text = item['rec_text'].replace('▁', ' ').replace(
-                    '<eos>', '').strip()
-                rec_tokenid = list(map(int, item['rec_tokenid'].split()))
-                f.write({
-                    "utt": name,
-                    "refs": [ref],
-                    "hyps": [rec_text],
-                    "hyps_tokenid": [rec_tokenid],
-                })
--- a/deepspeech/decoders/recog_bin.py
+++ b/deepspeech/decoders/recog_bin.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""End-to-end speech recognition model decoding script."""
-import logging
-import os
-import random
-import sys
-from distutils.util import strtobool
-
-import configargparse
-import numpy as np
-
-
-def get_parser():
-    """Get default arguments."""
-    parser = configargparse.ArgumentParser(
-        description="Transcribe text from speech using "
-        "a speech recognition model on one CPU or GPU",
-        config_file_parser_class=configargparse.YAMLConfigFileParser,
-        formatter_class=configargparse.ArgumentDefaultsHelpFormatter, )
-    parser.add(
-        '--model-name',
-        type=str,
-        default='u2_kaldi',
-        help='model name, e.g: deepspeech2, u2, u2_kaldi, u2_st')
-    # general configuration
-    parser.add("--config", is_config_file=True, help="Config file path")
-    parser.add(
-        "--config2",
-        is_config_file=True,
-        help="Second config file path that overwrites the settings in `--config`",
-    )
-    parser.add(
-        "--config3",
-        is_config_file=True,
-        help="Third config file path that overwrites the settings "
-        "in `--config` and `--config2`", )
-
-    parser.add_argument("--ngpu", type=int, default=0, help="Number of GPUs")
-    parser.add_argument(
-        "--dtype",
-        choices=("float16", "float32", "float64"),
-        default="float32",
-        help="Float precision (only available in --api v2)", )
-    parser.add_argument("--debugmode", type=int, default=1, help="Debugmode")
-    parser.add_argument("--seed", type=int, default=1, help="Random seed")
-    parser.add_argument(
-        "--verbose", "-V", type=int, default=2, help="Verbose option")
-    parser.add_argument(
-        "--batchsize",
-        type=int,
-        default=1,
-        help="Batch size for beam search (0: means no batch processing)", )
-    parser.add_argument(
-        "--preprocess-conf",
-        type=str,
-        default=None,
-        help="The configuration file for the pre-processing", )
-    parser.add_argument(
-        "--api",
-        default="v2",
-        choices=["v2"],
-        help="Beam search APIs "
-        "v2: Experimental API. It supports any models that implements ScorerInterface.",
-    )
-    # task related
-    parser.add_argument(
-        "--recog-json", type=str, help="Filename of recognition data (json)")
-    parser.add_argument(
-        "--result-label",
-        type=str,
-        required=True,
-        help="Filename of result label data (json)", )
-    # model (parameter) related
-    parser.add_argument(
-        "--model",
-        type=str,
-        required=True,
-        help="Model file parameters to read")
-    parser.add_argument(
-        "--model-conf", type=str, default=None, help="Model config file")
-    parser.add_argument(
-        "--num-spkrs",
-        type=int,
-        default=1,
-        choices=[1, 2],
-        help="Number of speakers in the speech", )
-    parser.add_argument(
-        "--num-encs",
-        default=1,
-        type=int,
-        help="Number of encoders in the model.")
-    # search related
-    parser.add_argument(
-        "--nbest", type=int, default=1, help="Output N-best hypotheses")
-    parser.add_argument("--beam-size", type=int, default=1, help="Beam size")
-    parser.add_argument(
-        "--penalty", type=float, default=0.0, help="Incertion penalty")
-    parser.add_argument(
-        "--maxlenratio",
-        type=float,
-        default=0.0,
-        help="""Input length ratio to obtain max output length.
-                        If maxlenratio=0.0 (default), it uses a end-detect function
-                        to automatically find maximum hypothesis lengths.
-                        If maxlenratio<0.0, its absolute value is interpreted
-                        as a constant max output length""", )
-    parser.add_argument(
-        "--minlenratio",
-        type=float,
-        default=0.0,
-        help="Input length ratio to obtain min output length", )
-    parser.add_argument(
-        "--ctc-weight",
-        type=float,
-        default=0.0,
-        help="CTC weight in joint decoding")
-    parser.add_argument(
-        "--weights-ctc-dec",
-        type=float,
-        action="append",
-        help="ctc weight assigned to each encoder during decoding."
-        "[in multi-encoder mode only]", )
-    parser.add_argument(
-        "--ctc-window-margin",
-        type=int,
-        default=0,
-        help="""Use CTC window with margin parameter to accelerate
-                        CTC/attention decoding especially on GPU. Smaller magin
-                        makes decoding faster, but may increase search errors.
-                        If margin=0 (default), this function is disabled""", )
-    # transducer related
-    parser.add_argument(
-        "--search-type",
-        type=str,
-        default="default",
-        choices=["default", "nsc", "tsd", "alsd", "maes"],
-        help="""Type of beam search implementation to use during inference.
-        Can be either: default beam search ("default"),
-        N-Step Constrained beam search ("nsc"), Time-Synchronous Decoding ("tsd"),
-        Alignment-Length Synchronous Decoding ("alsd") or
-        modified Adaptive Expansion Search ("maes").""", )
-    parser.add_argument(
-        "--nstep",
-        type=int,
-        default=1,
-        help="""Number of expansion steps allowed in NSC beam search or mAES
-        (nstep > 0 for NSC and nstep > 1 for mAES).""", )
-    parser.add_argument(
-        "--prefix-alpha",
-        type=int,
-        default=2,
-        help="Length prefix difference allowed in NSC beam search or mAES.", )
-    parser.add_argument(
-        "--max-sym-exp",
-        type=int,
-        default=2,
-        help="Number of symbol expansions allowed in TSD.", )
-    parser.add_argument(
-        "--u-max",
-        type=int,
-        default=400,
-        help="Length prefix difference allowed in ALSD.", )
-    parser.add_argument(
-        "--expansion-gamma",
-        type=float,
-        default=2.3,
-        help="Allowed logp difference for prune-by-value method in mAES.", )
-    parser.add_argument(
-        "--expansion-beta",
-        type=int,
-        default=2,
-        help="""Number of additional candidates for expanded hypotheses
-                selection in mAES.""", )
-    parser.add_argument(
-        "--score-norm",
-        type=strtobool,
-        nargs="?",
-        default=True,
-        help="Normalize final hypotheses' score by length", )
-    parser.add_argument(
-        "--softmax-temperature",
-        type=float,
-        default=1.0,
-        help="Penalization term for softmax function.", )
-    # rnnlm related
-    parser.add_argument(
-        "--rnnlm", type=str, default=None, help="RNNLM model file to read")
-    parser.add_argument(
-        "--rnnlm-conf",
-        type=str,
-        default=None,
-        help="RNNLM model config file to read")
-    parser.add_argument(
-        "--word-rnnlm",
-        type=str,
-        default=None,
-        help="Word RNNLM model file to read")
-    parser.add_argument(
-        "--word-rnnlm-conf",
-        type=str,
-        default=None,
-        help="Word RNNLM model config file to read", )
-    parser.add_argument(
-        "--word-dict", type=str, default=None, help="Word list to read")
-    parser.add_argument(
-        "--lm-weight", type=float, default=0.1, help="RNNLM weight")
-    # ngram related
-    parser.add_argument(
-        "--ngram-model",
-        type=str,
-        default=None,
-        help="ngram model file to read")
-    parser.add_argument(
-        "--ngram-weight", type=float, default=0.1, help="ngram weight")
-    parser.add_argument(
-        "--ngram-scorer",
-        type=str,
-        default="part",
-        choices=("full", "part"),
-        help="""if the ngram is set as a part scorer, similar with CTC scorer,
-                ngram scorer only scores topK hypethesis.
-                if the ngram is set as full scorer, ngram scorer scores all hypthesis
-                the decoding speed of part scorer is musch faster than full one""",
-    )
-    # streaming related
-    parser.add_argument(
-        "--streaming-mode",
-        type=str,
-        default=None,
-        choices=["window", "segment"],
-        help="""Use streaming recognizer for inference.
-                        `--batchsize` must be set to 0 to enable this mode""", )
-    parser.add_argument(
-        "--streaming-window", type=int, default=10, help="Window size")
-    parser.add_argument(
-        "--streaming-min-blank-dur",
-        type=int,
-        default=10,
-        help="Minimum blank duration threshold", )
-    parser.add_argument(
-        "--streaming-onset-margin", type=int, default=1, help="Onset margin")
-    parser.add_argument(
-        "--streaming-offset-margin", type=int, default=1, help="Offset margin")
-    # non-autoregressive related
-    # Mask CTC related. See https://arxiv.org/abs/2005.08700 for the detail.
-    parser.add_argument(
-        "--maskctc-n-iterations",
-        type=int,
-        default=10,
-        help="Number of decoding iterations."
-        "For Mask CTC, set 0 to predict 1 mask/iter.", )
-    parser.add_argument(
-        "--maskctc-probability-threshold",
-        type=float,
-        default=0.999,
-        help="Threshold probability for CTC output", )
-    # quantize model related
-    parser.add_argument(
-        "--quantize-config",
-        nargs="*",
-        help="Quantize config list. E.g.: --quantize-config=[Linear,LSTM,GRU]",
-    )
-    parser.add_argument(
-        "--quantize-dtype",
-        type=str,
-        default="qint8",
-        help="Dtype dynamic quantize")
-    parser.add_argument(
-        "--quantize-asr-model",
-        type=bool,
-        default=False,
-        help="Quantize asr model", )
-    parser.add_argument(
-        "--quantize-lm-model",
-        type=bool,
-        default=False,
-        help="Quantize lm model", )
-    return parser
-
-
-def main(args):
-    """Run the main decoding function."""
-    parser = get_parser()
-    parser.add_argument(
-        "--output", metavar="CKPT_DIR", help="path to save checkpoint.")
-    parser.add_argument(
-        "--checkpoint_path", type=str, help="path to load checkpoint")
-    parser.add_argument("--dict-path", type=str, help="path to load checkpoint")
-    args = parser.parse_args(args)
-
-    if args.ngpu == 0 and args.dtype == "float16":
-        raise ValueError(
-            f"--dtype {args.dtype} does not support the CPU backend.")
-
-    # logging info
-    if args.verbose == 1:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-        )
-    elif args.verbose == 2:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-        )
-    else:
-        logging.basicConfig(
-            level=logging.WARN,
-            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-        )
-        logging.warning("Skip DEBUG/INFO messages")
-    logging.info(args)
-
-    # check CUDA_VISIBLE_DEVICES
-    if args.ngpu > 0:
-        cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
-        if cvd is None:
-            logging.warning("CUDA_VISIBLE_DEVICES is not set.")
-        elif args.ngpu != len(cvd.split(",")):
-            logging.error("#gpus is not matched with CUDA_VISIBLE_DEVICES.")
-            sys.exit(1)
-
-        # TODO(mn5k): support of multiple GPUs
-        if args.ngpu > 1:
-            logging.error("The program only supports ngpu=1.")
-            sys.exit(1)
-
-    # display PYTHONPATH
-    logging.info("python path = " + os.environ.get("PYTHONPATH", "(None)"))
-
-    # seed setting
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    logging.info("set random seed = %d" % args.seed)
-
-    # validate rnn options
-    if args.rnnlm is not None and args.word_rnnlm is not None:
-        logging.error(
-            "It seems that both --rnnlm and --word-rnnlm are specified. "
-            "Please use either option.")
-        sys.exit(1)
-
-    # recog
-    if args.num_spkrs == 1:
-        if args.num_encs == 1:
-            # Experimental API that supports custom LMs
-            if args.api == "v2":
-                from deepspeech.decoders.recog import recog_v2
-                recog_v2(args)
-            else:
-                raise ValueError("Only support --api v2")
-        else:
-            if args.api == "v2":
-                raise NotImplementedError(
-                    f"--num-encs {args.num_encs} > 1 is not supported in --api v2"
-                )
-    elif args.num_spkrs == 2:
-        raise ValueError("asr_mix not supported.")
-
-
-if __name__ == "__main__":
-    main(sys.argv[1:])
--- a/deepspeech/decoders/scorers/ctc.py
+++ b/deepspeech/decoders/scorers/ctc.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""ScorerInterface implementation for CTC."""
-import numpy as np
-import paddle
-
-from .ctc_prefix_score import CTCPrefixScore
-from .ctc_prefix_score import CTCPrefixScorePD
-from .scorer_interface import BatchPartialScorerInterface
-
-
-class CTCPrefixScorer(BatchPartialScorerInterface):
-    """Decoder interface wrapper for CTCPrefixScore."""
-
-    def __init__(self, ctc: paddle.nn.Layer, eos: int):
-        """Initialize class.
-
-        Args:
-            ctc (paddle.nn.Layer): The CTC implementation.
-                For example, :class:`deepspeech.modules.ctc.CTC`
-            eos (int): The end-of-sequence id.
-
-        """
-        self.ctc = ctc
-        self.eos = eos
-        self.impl = None
-
-    def init_state(self, x: paddle.Tensor):
-        """Get an initial state for decoding.
-
-        Args:
-            x (paddle.Tensor): The encoded feature tensor
-
-        Returns: initial state
-
-        """
-        logp = self.ctc.log_softmax(x.unsqueeze(0)).squeeze(0).numpy()
-        # TODO(karita): use CTCPrefixScorePD
-        self.impl = CTCPrefixScore(logp, 0, self.eos, np)
-        return 0, self.impl.initial_state()
-
-    def select_state(self, state, i, new_id=None):
-        """Select state with relative ids in the main beam search.
-
-        Args:
-            state: Decoder state for prefix tokens
-            i (int): Index to select a state in the main beam search
-            new_id (int): New label id to select a state if necessary
-
-        Returns:
-            state: pruned state
-
-        """
-        if type(state) == tuple:
-            if len(state) == 2:  # for CTCPrefixScore
-                sc, st = state
-                return sc[i], st[i]
-            else:  # for CTCPrefixScorePD (need new_id > 0)
-                r, log_psi, f_min, f_max, scoring_idmap = state
-                s = log_psi[i, new_id].expand(log_psi.size(1))
-                if scoring_idmap is not None:
-                    return r[:, :, i, scoring_idmap[i, new_id]], s, f_min, f_max
-                else:
-                    return r[:, :, i, new_id], s, f_min, f_max
-        return None if state is None else state[i]
-
-    def score_partial(self, y, ids, state, x):
-        """Score new token.
-
-        Args:
-            y (paddle.Tensor): 1D prefix token
-            next_tokens (paddle.Tensor): paddle.int64 next token to score
-            state: decoder state for prefix tokens
-            x (paddle.Tensor): 2D encoder feature that generates ys
-
-        Returns:
-            tuple[paddle.Tensor, Any]:
-                Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
-                and next state for ys
-
-        """
-        prev_score, state = state
-        presub_score, new_st = self.impl(y.cpu(), ids.cpu(), state)
-        tscore = paddle.to_tensor(
-            presub_score - prev_score, place=x.place, dtype=x.dtype)
-        return tscore, (presub_score, new_st)
-
-    def batch_init_state(self, x: paddle.Tensor):
-        """Get an initial state for decoding.
-
-        Args:
-            x (paddle.Tensor): The encoded feature tensor
-
-        Returns: initial state
-
-        """
-        logp = self.ctc.log_softmax(x.unsqueeze(0))  # assuming batch_size = 1
-        xlen = paddle.to_tensor([logp.size(1)])
-        self.impl = CTCPrefixScorePD(logp, xlen, 0, self.eos)
-        return None
-
-    def batch_score_partial(self, y, ids, state, x):
-        """Score new token.
-
-        Args:
-            y (paddle.Tensor): 1D prefix token
-            ids (paddle.Tensor): paddle.int64 next token to score
-            state: decoder state for prefix tokens
-            x (paddle.Tensor): 2D encoder feature that generates ys
-
-        Returns:
-            tuple[paddle.Tensor, Any]:
-                Tuple of a score tensor for y that has a shape `(len(next_tokens),)`
-                and next state for ys
-
-        """
-        batch_state = (
-            (paddle.stack([s[0] for s in state], axis=2),
-             paddle.stack([s[1] for s in state]), state[0][2], state[0][3], )
-            if state[0] is not None else None)
-        return self.impl(y, batch_state, ids)
-
-    def extend_prob(self, x: paddle.Tensor):
-        """Extend probs for decoding.
-
-        This extension is for streaming decoding
-        as in Eq (14) in https://arxiv.org/abs/2006.14941
-
-        Args:
-            x (paddle.Tensor): The encoded feature tensor
-
-        """
-        logp = self.ctc.log_softmax(x.unsqueeze(0))
-        self.impl.extend_prob(logp)
-
-    def extend_state(self, state):
-        """Extend state for decoding.
-
-        This extension is for streaming decoding
-        as in Eq (14) in https://arxiv.org/abs/2006.14941
-
-        Args:
-            state: The states of hyps
-
-        Returns: exteded state
-
-        """
-        new_state = []
-        for s in state:
-            new_state.append(self.impl.extend_state(s))
-
-        return new_state
--- a/deepspeech/decoders/utils.py
+++ b/deepspeech/decoders/utils.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import numpy as np
-
-from deepspeech.utils.log import Log
-logger = Log(__name__).getlog()
-
-__all__ = ["end_detect", "parse_hypothesis", "add_results_to_json"]
-
-
-def end_detect(ended_hyps, i, M=3, D_end=np.log(1 * np.exp(-10))):
-    """End detection.
-
-    described in Eq. (50) of S. Watanabe et al
-    "Hybrid CTC/Attention Architecture for End-to-End Speech Recognition"
-
-    :param ended_hyps: dict
-    :param i: int
-    :param M: int
-    :param D_end: float
-    :return: bool
-    """
-    if len(ended_hyps) == 0:
-        return False
-    count = 0
-    best_hyp = sorted(ended_hyps, key=lambda x: x["score"], reverse=True)[0]
-    for m in range(M):
-        # get ended_hyps with their length is i - m
-        hyp_length = i - m
-        hyps_same_length = [
-            x for x in ended_hyps if len(x["yseq"]) == hyp_length
-        ]
-        if len(hyps_same_length) > 0:
-            best_hyp_same_length = sorted(
-                hyps_same_length, key=lambda x: x["score"], reverse=True)[0]
-            if best_hyp_same_length["score"] - best_hyp["score"] < D_end:
-                count += 1
-
-    if count == M:
-        return True
-    else:
-        return False
-
-
-# * ------------------ recognition related ------------------ *
-def parse_hypothesis(hyp, char_list):
-    """Parse hypothesis.
-
-    Args:
-        hyp (list[dict[str, Any]]): Recognition hypothesis.
-        char_list (list[str]): List of characters.
-
-    Returns:
-        tuple(str, str, str, float)
-
-    """
-    # remove sos and get results
-    tokenid_as_list = list(map(int, hyp["yseq"][1:]))
-    token_as_list = [char_list[idx] for idx in tokenid_as_list]
-    score = float(hyp["score"])
-
-    # convert to string
-    tokenid = " ".join([str(idx) for idx in tokenid_as_list])
-    token = " ".join(token_as_list)
-    text = "".join(token_as_list).replace("<space>", " ")
-
-    return text, token, tokenid, score
-
-
-def add_results_to_json(js, nbest_hyps, char_list):
-    """Add N-best results to json.
-
-    Args:
-        js (dict[str, Any]): Groundtruth utterance dict.
-        nbest_hyps_sd (list[dict[str, Any]]):
-            List of hypothesis for multi_speakers: nutts x nspkrs.
-        char_list (list[str]): List of characters.
-
-    Returns:
-        dict[str, Any]: N-best results added utterance dict.
-
-    """
-    # copy old json info
-    new_js = dict()
-    new_js["utt2spk"] = js["utt2spk"]
-    new_js["output"] = []
-
-    for n, hyp in enumerate(nbest_hyps, 1):
-        # parse hypothesis
-        rec_text, rec_token, rec_tokenid, score = parse_hypothesis(hyp,
-                                                                   char_list)
-
-        # copy ground-truth
-        if len(js["output"]) > 0:
-            out_dic = dict(js["output"][0].items())
-        else:
-            # for no reference case (e.g., speech translation)
-            out_dic = {"name": ""}
-
-        # update name
-        out_dic["name"] += "[%d]" % n
-
-        # add recognition results
-        out_dic["rec_text"] = rec_text
-        out_dic["rec_token"] = rec_token
-        out_dic["rec_tokenid"] = rec_tokenid
-        out_dic["score"] = score
-
-        # add to list of N-best result dicts
-        new_js["output"].append(out_dic)
-
-        # show 1-best result
-        if n == 1:
-            if "text" in out_dic.keys():
-                logger.info("groundtruth: %s" % out_dic["text"])
-            logger.info("prediction : %s" % out_dic["rec_text"])
-
-    return new_js
--- a/deepspeech/exps/__init__.py
+++ b/deepspeech/exps/__init__.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from deepspeech.training.trainer import Trainer
-from deepspeech.utils.dynamic_import import dynamic_import
-
-model_trainer_alias = {
-    "ds2": "deepspeech.exp.deepspeech2.model:DeepSpeech2Trainer",
-    "u2": "deepspeech.exps.u2.model:U2Trainer",
-    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Trainer",
-    "u2_st": "deepspeech.exps.u2_st.model:U2STTrainer",
-}
-
-
-def dynamic_import_trainer(module):
-    """Import Trainer dynamically.
-
-    Args:
-        module (str): trainer name. e.g., ds2, u2, u2_kaldi
-
-    Returns:
-        type: Trainer class
-
-    """
-    model_class = dynamic_import(module, model_trainer_alias)
-    assert issubclass(model_class,
-                      Trainer), f"{module} does not implement Trainer"
-    return model_class
-
-
-model_tester_alias = {
-    "ds2": "deepspeech.exp.deepspeech2.model:DeepSpeech2Tester",
-    "u2": "deepspeech.exps.u2.model:U2Tester",
-    "u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
-    "u2_st": "deepspeech.exps.u2_st.model:U2STTester",
-}
-
-
-def dynamic_import_tester(module):
-    """Import Tester dynamically.
-
-    Args:
-        module (str): tester name. e.g., ds2, u2, u2_kaldi
-
-    Returns:
-        type: Tester class
-
-    """
-    model_class = dynamic_import(module, model_tester_alias)
-    assert issubclass(model_class,
-                      Trainer), f"{module} does not implement Tester"
-    return model_class
--- a/deepspeech/exps/deepspeech2/bin/deploy/client.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/client.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Client-end for the ASR demo."""
-import argparse
-import sys
-
-import keyboard
-import pyaudio
-
-from deepspeech.utils.socket_server import socket_send
-
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--host_ip",
-    default="localhost",
-    type=str,
-    help="Server IP address. (default: %(default)s)")
-parser.add_argument(
-    "--host_port",
-    default=8086,
-    type=int,
-    help="Server Port. (default: %(default)s)")
-args = parser.parse_args()
-
-is_recording = False
-enable_trigger_record = True
-
-
-def on_press_release(x):
-    """Keyboard callback function."""
-    global is_recording, enable_trigger_record
-    press = keyboard.KeyboardEvent('down', 28, 'space')
-    release = keyboard.KeyboardEvent('up', 28, 'space')
-    if x.event_type == 'down' and x.name == press.name:
-        if (not is_recording) and enable_trigger_record:
-            sys.stdout.write("Start Recording ... ")
-            sys.stdout.flush()
-            is_recording = True
-    if x.event_type == 'up' and x.name == release.name:
-        if is_recording:
-            is_recording = False
-
-
-data_list = []
-
-
-def callback(in_data, frame_count, time_info, status):
-    """Audio recorder's stream callback function."""
-    global data_list, is_recording, enable_trigger_record
-    if is_recording:
-        data_list.append(in_data)
-        enable_trigger_record = False
-    elif len(data_list) > 0:
-        socket_send(args.host_ip, args.host_port, ''.join(data_list))
-        data_list = []
-    enable_trigger_record = True
-    return (in_data, pyaudio.paContinue)
-
-
-def main():
-    # prepare audio recorder
-    p = pyaudio.PyAudio()
-    stream = p.open(
-        format=pyaudio.paInt16,
-        channels=1,
-        rate=16000,
-        input=True,
-        stream_callback=callback)
-    stream.start_stream()
-
-    # prepare keyboard listener
-    while (1):
-        keyboard.hook(on_press_release)
-        if keyboard.record('esc'):
-            break
-
-    # close up
-    stream.stop_stream()
-    stream.close()
-    p.terminate()
-
-
-if __name__ == "__main__":
-    main()
--- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Server-end for the ASR demo."""
-import functools
-
-import numpy as np
-import paddle
-from paddle.inference import Config
-from paddle.inference import create_predictor
-from paddle.io import DataLoader
-
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.socket_server import AsrRequestHandler
-from deepspeech.utils.socket_server import AsrTCPServer
-from deepspeech.utils.socket_server import warm_up_test
-from deepspeech.utils.utility import add_arguments
-from deepspeech.utils.utility import print_arguments
-
-
-def init_predictor(args):
-    if args.model_dir is not None:
-        config = Config(args.model_dir)
-    else:
-        config = Config(args.model_file, args.params_file)
-
-    config.enable_memory_optim()
-    if args.use_gpu:
-        config.enable_use_gpu(memory_pool_init_size_mb=1000, device_id=0)
-    else:
-        # If not specific mkldnn, you can set the blas thread.
-        # The thread num should not be greater than the number of cores in the CPU.
-        config.set_cpu_math_library_num_threads(4)
-        config.enable_mkldnn()
-
-    predictor = create_predictor(config)
-    return predictor
-
-
-def run(predictor, img):
-    # copy img data to input tensor
-    input_names = predictor.get_input_names()
-    for i, name in enumerate(input_names):
-        input_tensor = predictor.get_input_handle(name)
-        #input_tensor.reshape(img[i].shape)
-        #input_tensor.copy_from_cpu(img[i].copy())
-
-    # do the inference
-    predictor.run()
-
-    results = []
-    # get out data from output tensor
-    output_names = predictor.get_output_names()
-    for i, name in enumerate(output_names):
-        output_tensor = predictor.get_output_handle(name)
-        output_data = output_tensor.copy_to_cpu()
-        results.append(output_data)
-
-    return results
-
-
-def inference(config, args):
-    predictor = init_predictor(args)
-
-
-def start_server(config, args):
-    """Start the ASR server"""
-    config.defrost()
-    config.data.manifest = config.data.test_manifest
-    dataset = ManifestDataset.from_config(config)
-
-    config.collator.augmentation_config = ""
-    config.collator.keep_transcription_text = True
-    config.collator.batch_size = 1
-    config.collator.num_workers = 0
-    collate_fn = SpeechCollator.from_config(config)
-    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
-
-    model = DeepSpeech2Model.from_pretrained(test_loader, config,
-                                             args.checkpoint_path)
-    model.eval()
-
-    # prepare ASR inference handler
-    def file_to_transcript(filename):
-        feature = test_loader.collate_fn.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
-        audio_len = feature[0].shape[0]
-        audio_len = np.array([audio_len]).astype('int64')  # [1]
-
-        result_transcript = model.decode(
-            paddle.to_tensor(audio),
-            paddle.to_tensor(audio_len),
-            vocab_list=test_loader.collate_fn.vocab_list,
-            decoding_method=config.decoding.decoding_method,
-            lang_model_path=config.decoding.lang_model_path,
-            beam_alpha=config.decoding.alpha,
-            beam_beta=config.decoding.beta,
-            beam_size=config.decoding.beam_size,
-            cutoff_prob=config.decoding.cutoff_prob,
-            cutoff_top_n=config.decoding.cutoff_top_n,
-            num_processes=config.decoding.num_proc_bsearch)
-        return result_transcript[0]
-
-    # warming up with utterrances sampled from Librispeech
-    print('-----------------------------------------------------------')
-    print('Warming up ...')
-    warm_up_test(
-        audio_process_handler=file_to_transcript,
-        manifest_path=args.warmup_manifest,
-        num_test_cases=3)
-    print('-----------------------------------------------------------')
-
-    # start the server
-    server = AsrTCPServer(
-        server_address=(args.host_ip, args.host_port),
-        RequestHandlerClass=AsrRequestHandler,
-        speech_save_dir=args.speech_save_dir,
-        audio_process_handler=file_to_transcript)
-    print("ASR Server Started.")
-    server.serve_forever()
-
-
-def main(config, args):
-    start_server(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    add_arg = functools.partial(add_arguments, argparser=parser)
-    # yapf: disable
-    add_arg('host_ip',          str,
-            'localhost',
-            "Server's IP address.")
-    add_arg('host_port',        int,    8089,    "Server's IP port.")
-    add_arg('speech_save_dir',  str,
-            'demo_cache',
-            "Directory to save demo audios.")
-    add_arg('warmup_manifest',  str, None, "Filepath of manifest to warm up.")
-    add_arg(
-        "--model_file",
-        type=str,
-        default="",
-        help="Model filename, Specify this when your model is a combined model."
-    )
-    add_arg(
-        "--params_file",
-        type=str,
-        default="",
-        help="Parameter filename, Specify this when your model is a combined model."
-    )
-    add_arg(
-        "--model_dir",
-        type=str,
-        default=None,
-        help="Model dir, If you load a non-combined model, specify the directory of the model."
-    )
-    add_arg("--use_gpu",
-                        type=bool,
-                        default=False,
-                        help="Whether use gpu.")
-    args = parser.parse_args()
-    print_arguments(args, globals())
-
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-
-    args.warmup_manifest = config.data.test_manifest
-    print_arguments(args, globals())
-
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/bin/deploy/send.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/send.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Socket client to send wav to ASR server."""
-import argparse
-import wave
-
-from deepspeech.utils.socket_server import socket_send
-
-# Name of the wav file that this client reads and uploads.
-WAVE_OUTPUT_FILENAME = "output.wav"
-
-# Command-line options: address of the ASR server to send the audio to.
-parser = argparse.ArgumentParser(description=__doc__)
-parser.add_argument(
-    "--host_ip",
-    type=str,
-    default="localhost",
-    help="Server IP address. (default: %(default)s)")
-parser.add_argument(
-    "--host_port",
-    type=int,
-    default=8086,
-    help="Server Port. (default: %(default)s)")
-# NOTE: parsed at module level; main() reads this namespace.
-args = parser.parse_args()
-
-
-def main():
-    """Read the wav file and send all of its frames to the ASR server.
-
-    Prints the basic properties of ``WAVE_OUTPUT_FILENAME`` and then passes
-    the raw frame bytes to ``socket_send`` together with the host and port
-    taken from the module-level ``args``.
-    """
-    # The context manager closes the wave file even if reading fails.
-    with wave.open(WAVE_OUTPUT_FILENAME, 'rb') as wf:
-        nframe = wf.getnframes()
-        data = wf.readframes(nframe)
-        print(f"Wave: {WAVE_OUTPUT_FILENAME}")
-        print(f"Wave samples: {nframe}")
-        print(f"Wave channels: {wf.getnchannels()}")
-        print(f"Wave sample rate: {wf.getframerate()}")
-        print(f"Wave sample width: {wf.getsampwidth()}")
-    assert isinstance(data, bytes)
-    socket_send(args.host_ip, args.host_port, data)
-
-
-# Send the wav file only when this module is executed as a script.
-if __name__ == "__main__":
-    main()
--- a/deepspeech/exps/deepspeech2/bin/deploy/server.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Server-end for the ASR demo."""
-import functools
-
-import numpy as np
-import paddle
-from paddle.io import DataLoader
-
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.socket_server import AsrRequestHandler
-from deepspeech.utils.socket_server import AsrTCPServer
-from deepspeech.utils.socket_server import warm_up_test
-from deepspeech.utils.utility import add_arguments
-from deepspeech.utils.utility import print_arguments
-
-
-def start_server(config, args):
-    """Load a DeepSpeech2 model, warm it up and serve ASR requests over TCP.
-
-    Args:
-        config: yacs config node. It is defrosted and modified in place: the
-            test manifest becomes the data manifest and the collator is set
-            to batch size 1, no workers and an empty augmentation config.
-        args: parsed arguments; reads ``checkpoint_path``,
-            ``warmup_manifest``, ``host_ip``, ``host_port`` and
-            ``speech_save_dir``.
-
-    The call does not return normally: it ends in ``serve_forever()``.
-    """
-    config.defrost()
-    config.data.manifest = config.data.test_manifest
-    dataset = ManifestDataset.from_config(config)
-
-    # Collator settings used for serving.
-    collator_cfg = config.collator
-    collator_cfg.augmentation_config = ""
-    collator_cfg.keep_transcription_text = True
-    collator_cfg.batch_size = 1
-    collator_cfg.num_workers = 0
-    collate_fn = SpeechCollator.from_config(config)
-    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
-
-    model = DeepSpeech2Model.from_pretrained(test_loader, config,
-                                             args.checkpoint_path)
-    model.eval()
-
-    def file_to_transcript(filename):
-        """Featurize one audio file and return its decoded transcript."""
-        collator = test_loader.collate_fn
-        feature = collator.process_utterance(filename, "")
-        feat = feature[0]
-        # Add a leading batch axis of size one.
-        audio = np.array([feat]).astype('float32')  # [1, T, D]
-        print('---file_to_transcript feature----')
-        print(audio.shape)
-        audio_len = feat.shape[0]
-        print(audio_len)
-        audio_len = np.array([audio_len]).astype('int64')  # [1]
-
-        decode_cfg = config.decoding
-        transcripts = model.decode(
-            paddle.to_tensor(audio),
-            paddle.to_tensor(audio_len),
-            vocab_list=test_loader.collate_fn.vocab_list,
-            decoding_method=decode_cfg.decoding_method,
-            lang_model_path=decode_cfg.lang_model_path,
-            beam_alpha=decode_cfg.alpha,
-            beam_beta=decode_cfg.beta,
-            beam_size=decode_cfg.beam_size,
-            cutoff_prob=decode_cfg.cutoff_prob,
-            cutoff_top_n=decode_cfg.cutoff_top_n,
-            num_processes=decode_cfg.num_proc_bsearch)
-        return transcripts[0]
-
-    # Run a few utterances from the warm-up manifest before serving.
-    separator = '-----------------------------------------------------------'
-    print(separator)
-    print('Warming up ...')
-    warm_up_test(
-        audio_process_handler=file_to_transcript,
-        manifest_path=args.warmup_manifest,
-        num_test_cases=3)
-    print(separator)
-
-    # Create the TCP server and block while handling requests.
-    address = (args.host_ip, args.host_port)
-    server = AsrTCPServer(
-        server_address=address,
-        RequestHandlerClass=AsrRequestHandler,
-        speech_save_dir=args.speech_save_dir,
-        audio_process_handler=file_to_transcript)
-    print("ASR Server Started.")
-    server.serve_forever()
-
-
-def main(config, args):
-    """Entry point: start the ASR server with the given config and args."""
-    start_server(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    add_arg = functools.partial(add_arguments, argparser=parser)
-    # yapf: disable
-    add_arg('host_ip',          str,
-            'localhost',
-            "Server's IP address.")
-    add_arg('host_port',        int,    8088,    "Server's IP port.")
-    add_arg('speech_save_dir',  str,
-            'demo_cache',
-            "Directory to save demo audios.")
-    add_arg('warmup_manifest', str, None, "Filepath of manifest to warm up.")
-    # yapf: enable
-    args = parser.parse_args()
-    print_arguments(args, globals())
-
-    # Layer the configuration: defaults, then the config file, then the
-    # entries of args.opts.
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-
-    # Fall back to the test manifest only when no warm-up manifest was given;
-    # overwriting it unconditionally made --warmup_manifest a dead option.
-    if not args.warmup_manifest:
-        args.warmup_manifest = config.data.test_manifest
-    print_arguments(args, globals())
-
-    # Optionally write the final configuration to a file.
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/bin/export.py
+++ b/deepspeech/exps/deepspeech2/bin/export.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Export for DeepSpeech2 model."""
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.utility import print_arguments
-
-
-def main_sp(config, args):
-    """Build the DeepSpeech2 tester and export its model.
-
-    Both ``setup()`` and ``run_export()`` run inside the tester's ``eval()``
-    context.
-    """
-    tester = Tester(config, args)
-    with tester.eval():
-        tester.setup()
-        tester.run_export()
-
-
-def main(config, args):
-    """Entry point: run the export in the current process via ``main_sp``."""
-    main_sp(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    # Destination path of the exported jit model.
-    parser.add_argument(
-        "--export_path", type=str, help="path of the jit model to save")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help="offline/online")
-    args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
-    print_arguments(args)
-
-    # Layer the configuration: defaults for the chosen model type, then the
-    # config file, then the entries of args.opts.
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults(args.model_type)
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    # Optionally write the final configuration to a file.
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/bin/test.py
+++ b/deepspeech/exps/deepspeech2/bin/test.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Evaluation for DeepSpeech2 model."""
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester as Tester
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.utility import print_arguments
-
-
-def main_sp(config, args):
-    """Build the DeepSpeech2 tester and run its test pass.
-
-    Both ``setup()`` and ``run_test()`` run inside the tester's ``eval()``
-    context.
-    """
-    tester = Tester(config, args)
-    with tester.eval():
-        tester.setup()
-        tester.run_test()
-
-
-def main(config, args):
-    """Entry point: run the evaluation in the current process via ``main_sp``."""
-    main_sp(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
-    # File that receives the ASR results.
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
-    args = parser.parse_args()
-    print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))
-
-    # Layer the configuration: defaults for the chosen model type, then the
-    # config file, then the entries of args.opts.
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults(args.model_type)
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    # Optionally write the final configuration to a file.
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/bin/test_export.py
+++ b/deepspeech/exps/deepspeech2/bin/test_export.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Evaluation for DeepSpeech2 model."""
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.utility import print_arguments
-
-
-def main_sp(config, args):
-    """Build the export tester and run its test pass.
-
-    Both ``setup()`` and ``run_test()`` run inside the tester's ``eval()``
-    context.
-    """
-    tester = ExportTester(config, args)
-    with tester.eval():
-        tester.setup()
-        tester.run_test()
-
-
-def main(config, args):
-    """Entry point: run the export test in the current process via ``main_sp``."""
-    main_sp(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    # File that receives the ASR results.
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
-    # Location of the exported jit model to evaluate. The help text used to
-    # say "to save", copied from the export script.
-    parser.add_argument(
-        "--export_path", type=str, help="path of the jit model to load")
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
-    args = parser.parse_args()
-    print_arguments(args, globals())
-    print("model_type:{}".format(args.model_type))
-
-    # Layer the configuration: defaults for the chosen model type, then the
-    # config file, then the entries of args.opts.
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults(args.model_type)
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    # Optionally write the final configuration to a file.
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/bin/test_hub.py
+++ b/deepspeech/exps/deepspeech2/bin/test_hub.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Evaluation for DeepSpeech2 model."""
-import os
-import sys
-from pathlib import Path
-
-import paddle
-import soundfile
-
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils import mp_tools
-from deepspeech.utils.checkpoint import Checkpoint
-from deepspeech.utils.log import Log
-from deepspeech.utils.utility import print_arguments
-from deepspeech.utils.utility import UpdateConfig
-
-logger = Log(__name__).getlog()
-
-
-class DeepSpeech2Tester_hub():
-    """Transcribe a single audio file with a DeepSpeech2 checkpoint."""
-
-    def __init__(self, config, args):
-        """Keep config/args and build the collator used to featurize audio.
-
-        Args:
-            config: experiment config; ``config.collator.unit_type`` is read.
-            args: parsed arguments; ``args.audio_file`` is the file to decode.
-        """
-        self.args = args
-        self.config = config
-        self.audio_file = args.audio_file
-        self.collate_fn_test = SpeechCollator.from_config(config)
-        self._text_featurizer = TextFeaturizer(
-            unit_type=config.collator.unit_type, vocab_filepath=None)
-
-    def compute_result_transcripts(self, audio, audio_len, vocab_list, cfg):
-        """Decode ``audio`` with the model and return the transcripts.
-
-        Args:
-            audio: input passed as first argument to ``self.model.decode``.
-            audio_len: length input passed to ``self.model.decode``.
-            vocab_list: vocabulary passed to the decoder.
-            cfg: decoding options (``decoding_method``, ``lang_model_path``,
-                ``alpha``, ``beta``, ``beam_size``, ``cutoff_prob``,
-                ``cutoff_top_n``, ``num_proc_bsearch``).
-
-        Returns:
-            Whatever ``self.model.decode`` returns; callers index element 0.
-        """
-        result_transcripts = self.model.decode(
-            audio,
-            audio_len,
-            vocab_list,
-            decoding_method=cfg.decoding_method,
-            lang_model_path=cfg.lang_model_path,
-            beam_alpha=cfg.alpha,
-            beam_beta=cfg.beta,
-            beam_size=cfg.beam_size,
-            cutoff_prob=cfg.cutoff_prob,
-            cutoff_top_n=cfg.cutoff_top_n,
-            num_processes=cfg.num_proc_bsearch)
-
-        return result_transcripts
-
-    @mp_tools.rank_zero_only
-    @paddle.no_grad()
-    def test(self):
-        """Featurize ``self.audio_file``, decode it and log the transcript."""
-        self.model.eval()
-        cfg = self.config
-        audio_file = self.audio_file
-        collate_fn_test = self.collate_fn_test
-        audio, _ = collate_fn_test.process_utterance(
-            audio_file=audio_file, transcript=" ")
-        audio_len = audio.shape[0]
-        audio = paddle.to_tensor(audio, dtype='float32')
-        audio_len = paddle.to_tensor(audio_len)
-        # Add a leading batch axis of size one.
-        audio = paddle.unsqueeze(audio, axis=0)
-        vocab_list = collate_fn_test.vocab_list
-        result_transcripts = self.compute_result_transcripts(
-            audio, audio_len, vocab_list, cfg.decoding)
-        logger.info("result_transcripts: " + result_transcripts[0])
-
-    def run_test(self):
-        """Load the checkpoint parameters and run ``test()``.
-
-        A ``KeyboardInterrupt`` during the test exits with status -1.
-        """
-        self.resume()
-        try:
-            self.test()
-        except KeyboardInterrupt:
-            # sys.exit instead of the builtin ``exit``, which is only
-            # injected by the ``site`` module.
-            sys.exit(-1)
-
-    def setup(self):
-        """Setup the experiment.
-
-        Selects the 'gpu' device when ``args.nprocs > 0`` (otherwise 'cpu'),
-        then prepares the output directory, the checkpointer and the model.
-        """
-        paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
-
-        self.setup_output_dir()
-        self.setup_checkpointer()
-
-        self.setup_model()
-
-    def setup_output_dir(self):
-        """Create a directory used for output.
-
-        Uses ``args.output`` when given; otherwise the directory two levels
-        above ``args.checkpoint_path``.
-        """
-        if self.args.output:
-            output_dir = Path(self.args.output).expanduser()
-        else:
-            output_dir = Path(
-                self.args.checkpoint_path).expanduser().parent.parent
-        output_dir.mkdir(parents=True, exist_ok=True)
-        self.output_dir = output_dir
-
-    def setup_model(self):
-        """Build the offline or online DeepSpeech2 model from the config.
-
-        Raises:
-            Exception: if ``args.model_type`` is neither 'offline' nor
-                'online'.
-        """
-        config = self.config.clone()
-        # Feature and vocabulary sizes come from the collator.
-        with UpdateConfig(config):
-            config.model.feat_size = self.collate_fn_test.feature_size
-            config.model.dict_size = self.collate_fn_test.vocab_size
-
-        if self.args.model_type == 'offline':
-            model = DeepSpeech2Model.from_config(config.model)
-        elif self.args.model_type == 'online':
-            model = DeepSpeech2ModelOnline.from_config(config.model)
-        else:
-            raise Exception("wrong model type")
-
-        self.model = model
-
-    def setup_checkpointer(self):
-        """Create a directory used to save checkpoints into.
-
-        It is "checkpoints" inside the output directory.
-        """
-        checkpoint_dir = self.output_dir / "checkpoints"
-        checkpoint_dir.mkdir(exist_ok=True)
-
-        self.checkpoint_dir = checkpoint_dir
-
-        self.checkpoint = Checkpoint(
-            kbest_n=self.config.training.checkpoint.kbest_n,
-            latest_n=self.config.training.checkpoint.latest_n)
-
-    def resume(self):
-        """Load model parameters from ``<args.checkpoint_path>.pdparams``."""
-        params_path = self.args.checkpoint_path + ".pdparams"
-        model_dict = paddle.load(params_path)
-        self.model.set_state_dict(model_dict)
-
-
-def check(audio_file):
-    """Verify that ``audio_file`` can be read and is sampled at 16000 Hz.
-
-    Exits the process with status -1 when ``soundfile`` cannot read the file
-    or when the sample rate is not 16000.
-
-    Args:
-        audio_file: path of the audio file to check.
-    """
-    logger.info("checking the audio file format......")
-    try:
-        # Only the sample rate is needed; the signal itself is discarded.
-        _, sample_rate = soundfile.read(audio_file)
-    except Exception as e:
-        logger.error(str(e))
-        logger.error(
-            "can not open the wav file, please check the audio file format")
-        sys.exit(-1)
-    logger.info("The sample rate is %d" % sample_rate)
-    # Explicit check instead of ``assert``: assertions are removed when
-    # Python runs with -O, which would silently accept any sample rate.
-    if sample_rate != 16000:
-        logger.error("The sample rate must be 16000, but got %d" % sample_rate)
-        sys.exit(-1)
-    logger.info("The audio file format is right")
-
-
-def main_sp(config, args):
-    """Build the single-file tester, set it up and run the test."""
-    tester = DeepSpeech2Tester_hub(config, args)
-    tester.setup()
-    tester.run_test()
-
-
-def main(config, args):
-    """Entry point: run the single-file test in the current process."""
-    main_sp(config, args)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
-    parser.add_argument("--audio_file", type=str, help='audio file path')
-    # File that receives the ASR results.
-    parser.add_argument(
-        "--result_file", type=str, help="path of save the asr result")
-    args = parser.parse_args()
-    print_arguments(args, globals())
-    # --audio_file defaults to None and os.path.isfile(None) raises
-    # TypeError, so test for a missing value before touching the filesystem.
-    if not args.audio_file or not os.path.isfile(args.audio_file):
-        print("Please input the audio file path")
-        sys.exit(-1)
-    check(args.audio_file)
-    print("model_type:{}".format(args.model_type))
-
-    # Layer the configuration: defaults for the chosen model type, then the
-    # config file, then the entries of args.opts.
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults(args.model_type)
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    # Optionally write the final configuration to a file.
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/bin/train.py
+++ b/deepspeech/exps/deepspeech2/bin/train.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Trainer for DeepSpeech2 model."""
-from paddle import distributed as dist
-
-from deepspeech.exps.deepspeech2.config import get_cfg_defaults
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
-from deepspeech.training.cli import default_argument_parser
-from deepspeech.utils.utility import print_arguments
-
-
-def main_sp(config, args):
-    """Build the DeepSpeech2 trainer, set it up and run training."""
-    trainer = Trainer(config, args)
-    trainer.setup()
-    trainer.run()
-
-
-def main(config, args):
-    """Run training, spawning ``args.nprocs`` processes when it is positive.
-
-    With ``args.nprocs`` not greater than zero, ``main_sp`` runs directly in
-    the current process.
-    """
-    if not args.nprocs > 0:
-        main_sp(config, args)
-        return
-    dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
-
-
-if __name__ == "__main__":
-    parser = default_argument_parser()
-    parser.add_argument(
-        "--model_type", type=str, default='offline', help='offline/online')
-    args = parser.parse_args()
-    print("model_type:{}".format(args.model_type))
-    print_arguments(args, globals())
-
-    # Layer the configuration: defaults for the chosen model type, then the
-    # config file, then the entries of args.opts.
-    # https://yaml.org/type/float.html
-    config = get_cfg_defaults(args.model_type)
-    if args.config:
-        config.merge_from_file(args.config)
-    if args.opts:
-        config.merge_from_list(args.opts)
-    config.freeze()
-    print(config)
-    # Optionally write the final configuration to a file.
-    if args.dump_config:
-        with open(args.dump_config, 'w') as f:
-            print(config, file=f)
-
-    main(config, args)
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from yacs.config import CfgNode
-
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester
-from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
-from deepspeech.io.collator import SpeechCollator
-from deepspeech.io.dataset import ManifestDataset
-from deepspeech.models.ds2 import DeepSpeech2Model
-from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
-
-
-def get_cfg_defaults(model_type='offline'):
-    """Get a yacs CfgNode object with default values for DeepSpeech2.
-
-    Args:
-        model_type: 'offline' selects the ``DeepSpeech2Model`` defaults for
-            the ``model`` section; any other value selects the
-            ``DeepSpeech2ModelOnline`` defaults.
-
-    Returns:
-        A clone of the assembled defaults on which
-        ``set_new_allowed(True)`` has been called.
-    """
-    _C = CfgNode()
-    _C.data = ManifestDataset.params()
-    _C.collator = SpeechCollator.params()
-    _C.training = DeepSpeech2Trainer.params()
-    _C.decoding = DeepSpeech2Tester.params()
-    if model_type == 'offline':
-        _C.model = DeepSpeech2Model.params()
-    else:
-        _C.model = DeepSpeech2ModelOnline.params()
-    # Return a clone so that the defaults will not be altered
-    # This is for the "local variable" use pattern
-    config = _C.clone()
-    config.set_new_allowed(True)
-    return config
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
--- a/deepspeech/exps/lm/transformer/bin/cacu_perplexity.py
+++ b/deepspeech/exps/lm/transformer/bin/cacu_perplexity.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import sys
-
-import configargparse
-
-
-def get_parser():
-    """Build the argument parser of the perplexity tool."""
-    parser = configargparse.ArgumentParser(
-        description="The parser for caculating the perplexity of transformer language model ",
-        config_file_parser_class=configargparse.YAMLConfigFileParser,
-        formatter_class=configargparse.ArgumentDefaultsHelpFormatter, )
-
-    # (flag, add_argument keyword arguments), registered in this order.
-    option_specs = (
-        ("--rnnlm", dict(
-            type=str, default=None, help="RNNLM model file to read")),
-        ("--rnnlm-conf", dict(
-            type=str, default=None, help="RNNLM model config file to read")),
-        ("--vocab_path", dict(
-            type=str, default=None, help="vocab path to for token2id")),
-        ("--bpeprefix", dict(
-            type=str,
-            default=None,
-            help="The path of bpeprefix for loading")),
-        ("--text_path", dict(
-            type=str,
-            default=None,
-            help="The path of text file for testing ")),
-        ("--ngpu", dict(
-            type=int,
-            default=0,
-            help="The number of gpu to use, 0 for using cpu instead")),
-        ("--dtype", dict(
-            choices=("float16", "float32", "float64"),
-            default="float32",
-            help="Float precision (only available in --api v2)")),
-        ("--output_dir", dict(
-            type=str,
-            default=".",
-            help="The output directory to store the sentence PPL")),
-    )
-    for flag, options in option_specs:
-        parser.add_argument(flag, **options)
-
-    return parser
-
-
-def main(args):
-    """Parse the argument list ``args`` and run the perplexity computation."""
-    parsed = get_parser().parse_args(args)
-    # The import is deferred until after the arguments have been parsed.
-    from deepspeech.exps.lm.transformer.lm_cacu_perplexity import run_get_perplexity
-    run_get_perplexity(parsed)
-
-
-# Pass the command-line arguments (without the program name) to main().
-if __name__ == "__main__":
-    main(sys.argv[1:])
--- a/deepspeech/exps/lm/transformer/lm_cacu_perplexity.py
+++ b/deepspeech/exps/lm/transformer/lm_cacu_perplexity.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# Calculating the perplexity (PPL) of the language model
-import os
-
-import numpy as np
-import paddle
-from paddle.io import DataLoader
-from yacs.config import CfgNode
-
-from deepspeech.io.collator import TextCollatorSpm
-from deepspeech.io.dataset import TextDataset
-from deepspeech.models.lm_interface import dynamic_import_lm
-from deepspeech.utils.log import Log
-
-logger = Log(__name__).getlog()
-
-
-def get_config(config_path):
-    """Load the file at ``config_path`` into a CfgNode that allows new keys."""
-    cfg = CfgNode(new_allowed=True)
-    cfg.merge_from_file(config_path)
-    return cfg
-
-
-def load_trained_lm(args):
-    """Instantiate the language model and load its trained parameters.
-
-    Args:
-        args: parsed arguments; ``rnnlm_conf`` is the config file and
-            ``rnnlm`` the parameter file.
-
-    Returns:
-        Tuple ``(lm, lm_config)``.
-    """
-    lm_config = get_config(args.rnnlm_conf)
-    # The config names the model class and supplies its constructor kwargs.
-    lm_class = dynamic_import_lm(lm_config.model_module)
-    lm = lm_class(**lm_config.model)
-    lm.set_state_dict(paddle.load(args.rnnlm))
-    return lm, lm_config
-
-
-def write_dict_into_file(ppl_dict, name):
-    """Write every entry of ``ppl_dict`` to file ``name`` as ``key value``.
-
-    One line per entry, in the dictionary's iteration order. Keys and values
-    must be strings.
-    """
-    with open(name, "w") as out:
-        for key, value in ppl_dict.items():
-            out.write(" ".join((key, value)) + "\n")
-
-
-def cacu_perplexity(
-        lm_model,
-        lm_config,
-        args,
-        log_base=None, ):
-    """Compute per-utterance and overall perplexity of a language model.
-
-    Per-utterance perplexities and token counts are written to the files
-    "uttPPL" and "uttLEN" inside ``args.output_dir``.
-
-    Args:
-        lm_model: model whose ``forward`` returns a 5-tuple with the negative
-            log-likelihoods and token counts in the last two positions.
-        lm_config: config providing ``data.unit_type``,
-            ``decoding.batch_size`` and ``decoding.num_workers``.
-        args: parsed arguments; reads ``text_path``, ``vocab_path``,
-            ``bpeprefix`` and ``output_dir``.
-        log_base: base used in the perplexity formula; ``None`` means e.
-
-    Returns:
-        Tuple ``(ppl, log_base)`` where ``log_base`` is ``np.e`` when the
-        argument was ``None``.
-    """
-
-    def _perplexity(nll_sum, token_count):
-        # Same formula for single utterances and for the whole corpus.
-        if log_base is None:
-            return np.exp(nll_sum / token_count)
-        return log_base**(nll_sum / token_count / np.log(log_base))
-
-    unit_type = lm_config.data.unit_type
-    batch_size = lm_config.decoding.batch_size
-    num_workers = lm_config.decoding.num_workers
-    text_file_path = args.text_path
-
-    text_dataset = TextDataset.from_file(text_file_path)
-    collate_fn_text = TextCollatorSpm(
-        unit_type=unit_type,
-        vocab_filepath=args.vocab_path,
-        spm_model_prefix=args.bpeprefix)
-    train_loader = DataLoader(
-        text_dataset,
-        batch_size=batch_size,
-        collate_fn=collate_fn_text,
-        num_workers=num_workers)
-
-    ppl_dict = {}
-    len_dict = {}
-    total_nll = 0.0
-    total_ntokens = 0
-
-    logger.info("start caculating PPL......")
-    for keys, ys_input_pad, ys_output_pad, _y_lens in train_loader():
-        ys_input_pad = paddle.to_tensor(ys_input_pad)
-        ys_output_pad = paddle.to_tensor(ys_output_pad)
-        # Only the last two outputs of the model are used here.
-        _, _, _, nll, nll_count = lm_model.forward(ys_input_pad,
-                                                   ys_output_pad)
-        nll = nll.numpy()
-        nll_count = nll_count.numpy()
-
-        # Keep the PPL of each utterance for debugging or analysis.
-        for key, utt_nll, ntoken in zip(keys, nll, nll_count):
-            ppl_dict[key] = str(_perplexity(utt_nll, ntoken))
-            len_dict[key] = str(ntoken)
-
-        total_nll += nll.sum()
-        total_ntokens += nll_count.sum()
-        logger.info("Current total nll: " + str(total_nll))
-        logger.info("Current total tokens: " + str(total_ntokens))
-
-    write_dict_into_file(ppl_dict, os.path.join(args.output_dir, "uttPPL"))
-    write_dict_into_file(len_dict, os.path.join(args.output_dir, "uttLEN"))
-
-    ppl = _perplexity(total_nll, total_ntokens)
-    return ppl, (np.e if log_base is None else log_base)
-
-
-def run_get_perplexity(args):
-    """Load the trained language model and log its perplexity.
-
-    Args:
-        args: parsed arguments; ``ngpu`` selects the device (1 -> "gpu:0",
-            otherwise "cpu") and ``dtype`` names a paddle dtype.
-
-    Raises:
-        NotImplementedError: if ``args.ngpu`` is greater than 1.
-    """
-    if args.ngpu > 1:
-        raise NotImplementedError("only single GPU decoding is supported")
-    device = "gpu:0" if args.ngpu == 1 else "cpu"
-    paddle.set_device(device)
-    dtype = getattr(paddle, args.dtype)
-    logger.info(f"Decoding device={device}, dtype={dtype}")
-
-    lm_model, lm_config = load_trained_lm(args)
-    lm_model.to(device=device, dtype=dtype)
-    lm_model.eval()
-
-    ppl, log_base = cacu_perplexity(lm_model, lm_config, args, None)
-    logger.info("Final PPL: " + str(ppl))
-    logger.info("The log base is:" + "%.2f" % log_base)
--- a/deepspeech/exps/u2/bin/alignment.py
+++ b/deepspeech/exps/u2/bin/alignment.py
--- a/deepspeech/exps/u2/bin/export.py
+++ b/deepspeech/exps/u2/bin/export.py
--- a/deepspeech/exps/u2/bin/test.py
+++ b/deepspeech/exps/u2/bin/test.py
--- a/deepspeech/exps/u2/bin/test_hub.py
+++ b/deepspeech/exps/u2/bin/test_hub.py
--- a/deepspeech/exps/u2/bin/train.py
+++ b/deepspeech/exps/u2/bin/train.py
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
--- a/deepspeech/exps/u2/trainer.py
+++ b/deepspeech/exps/u2/trainer.py
--- a/deepspeech/exps/u2_kaldi/bin/recog.py
+++ b/deepspeech/exps/u2_kaldi/bin/recog.py
--- a/deepspeech/exps/u2_kaldi/bin/test.py
+++ b/deepspeech/exps/u2_kaldi/bin/test.py
--- a/deepspeech/exps/u2_kaldi/bin/train.py
+++ b/deepspeech/exps/u2_kaldi/bin/train.py
--- a/deepspeech/exps/u2_kaldi/model.py
+++ b/deepspeech/exps/u2_kaldi/model.py
--- a/deepspeech/exps/u2_st/bin/export.py
+++ b/deepspeech/exps/u2_st/bin/export.py
--- a/deepspeech/exps/u2_st/bin/test.py
+++ b/deepspeech/exps/u2_st/bin/test.py
--- a/deepspeech/exps/u2_st/bin/train.py
+++ b/deepspeech/exps/u2_st/bin/train.py
--- a/deepspeech/exps/u2_st/config.py
+++ b/deepspeech/exps/u2_st/config.py
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
--- a/deepspeech/frontend/augmentor/impulse_response.py
+++ b/deepspeech/frontend/augmentor/impulse_response.py
--- a/deepspeech/frontend/augmentor/noise_perturb.py
+++ b/deepspeech/frontend/augmentor/noise_perturb.py
--- a/deepspeech/frontend/augmentor/online_bayesian_normalization.py
+++ b/deepspeech/frontend/augmentor/online_bayesian_normalization.py
--- a/deepspeech/frontend/augmentor/resample.py
+++ b/deepspeech/frontend/augmentor/resample.py
--- a/deepspeech/frontend/augmentor/shift_perturb.py
+++ b/deepspeech/frontend/augmentor/shift_perturb.py
--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
--- a/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/deepspeech/frontend/augmentor/speed_perturb.py
--- a/deepspeech/frontend/augmentor/volume_perturb.py
+++ b/deepspeech/frontend/augmentor/volume_perturb.py
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
--- a/deepspeech/frontend/speech.py
+++ b/deepspeech/frontend/speech.py
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
--- a/deepspeech/io/batchfy.py
+++ b/deepspeech/io/batchfy.py
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
--- a/deepspeech/io/converter.py
+++ b/deepspeech/io/converter.py
--- a/deepspeech/io/dataloader.py
+++ b/deepspeech/io/dataloader.py
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
--- a/deepspeech/io/reader.py
+++ b/deepspeech/io/reader.py
--- a/deepspeech/io/sampler.py
+++ b/deepspeech/io/sampler.py
--- a/deepspeech/io/utility.py
+++ b/deepspeech/io/utility.py
--- a/deepspeech/models/asr_interface.py
+++ b/deepspeech/models/asr_interface.py
--- a/deepspeech/models/ds2/conv.py
+++ b/deepspeech/models/ds2/conv.py
--- a/deepspeech/models/ds2/deepspeech2.py
+++ b/deepspeech/models/ds2/deepspeech2.py
--- a/deepspeech/models/ds2/rnn.py
+++ b/deepspeech/models/ds2/rnn.py
--- a/deepspeech/models/ds2_online/conv.py
+++ b/deepspeech/models/ds2_online/conv.py
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
--- a/deepspeech/models/lm/transformer.py
+++ b/deepspeech/models/lm/transformer.py
--- a/deepspeech/models/lm_interface.py
+++ b/deepspeech/models/lm_interface.py
--- a/deepspeech/models/st_interface.py
+++ b/deepspeech/models/st_interface.py
--- a/deepspeech/models/u2/u2.py
+++ b/deepspeech/models/u2/u2.py
--- a/deepspeech/models/u2/updater.py
+++ b/deepspeech/models/u2/updater.py
--- a/deepspeech/models/u2_st/u2_st.py
+++ b/deepspeech/models/u2_st/u2_st.py
--- a/deepspeech/modules/activation.py
+++ b/deepspeech/modules/activation.py
--- a/deepspeech/modules/attention.py
+++ b/deepspeech/modules/attention.py
--- a/deepspeech/modules/cmvn.py
+++ b/deepspeech/modules/cmvn.py
--- a/deepspeech/modules/conformer_convolution.py
+++ b/deepspeech/modules/conformer_convolution.py
--- a/deepspeech/modules/crf.py
+++ b/deepspeech/modules/crf.py
--- a/deepspeech/modules/ctc.py
+++ b/deepspeech/modules/ctc.py
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
--- a/deepspeech/modules/decoder_layer.py
+++ b/deepspeech/modules/decoder_layer.py
--- a/deepspeech/modules/embedding.py
+++ b/deepspeech/modules/embedding.py
--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
--- a/deepspeech/modules/encoder_layer.py
+++ b/deepspeech/modules/encoder_layer.py
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
--- a/deepspeech/modules/positionwise_feed_forward.py
+++ b/deepspeech/modules/positionwise_feed_forward.py
--- a/deepspeech/modules/subsampling.py
+++ b/deepspeech/modules/subsampling.py
--- a/deepspeech/training/cli.py
+++ b/deepspeech/training/cli.py
--- a/deepspeech/training/extensions/evaluator.py
+++ b/deepspeech/training/extensions/evaluator.py
--- a/deepspeech/training/extensions/snapshot.py
+++ b/deepspeech/training/extensions/snapshot.py
--- a/deepspeech/training/gradclip.py
+++ b/deepspeech/training/gradclip.py
--- a/deepspeech/training/optimizer.py
+++ b/deepspeech/training/optimizer.py
--- a/deepspeech/training/scheduler.py
+++ b/deepspeech/training/scheduler.py
--- a/deepspeech/training/timer.py
+++ b/deepspeech/training/timer.py
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
--- a/deepspeech/training/updaters/standard_updater.py
+++ b/deepspeech/training/updaters/standard_updater.py
--- a/deepspeech/training/updaters/trainer.py
+++ b/deepspeech/training/updaters/trainer.py
--- a/deepspeech/training/updaters/updater.py
+++ b/deepspeech/training/updaters/updater.py
--- a/deepspeech/transform/functional.py
+++ b/deepspeech/transform/functional.py
--- a/deepspeech/transform/perturb.py
+++ b/deepspeech/transform/perturb.py
--- a/deepspeech/transform/spec_augment.py
+++ b/deepspeech/transform/spec_augment.py
--- a/deepspeech/transform/transformation.py
+++ b/deepspeech/transform/transformation.py
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
--- a/deepspeech/utils/cli_readers.py
+++ b/deepspeech/utils/cli_readers.py
--- a/deepspeech/utils/cli_writers.py
+++ b/deepspeech/utils/cli_writers.py
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
--- a/deepspeech/utils/dynamic_import.py
+++ b/deepspeech/utils/dynamic_import.py
--- a/deepspeech/utils/profiler.py
+++ b/deepspeech/utils/profiler.py
--- a/deepspeech/utils/socket_server.py
+++ b/deepspeech/utils/socket_server.py
--- a/deepspeech/utils/tensor_utils.py
+++ b/deepspeech/utils/tensor_utils.py
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
--- a/docs/make.bat
+++ b/docs/make.bat
--- a/docs/source/asr/models_introduction.md
+++ b/docs/source/asr/models_introduction.md
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
--- a/docs/source/install.md
+++ b/docs/source/install.md
--- a/docs/source/tts/advanced_usage.md
+++ b/docs/source/tts/advanced_usage.md
--- a/docs/source/tts/quick_start.md
+++ b/docs/source/tts/quick_start.md
--- a/examples/aishell/s0/path.sh
+++ b/examples/aishell/s0/path.sh
--- a/examples/aishell/s1/path.sh
+++ b/examples/aishell/s1/path.sh
--- a/examples/aishell3/tts3/path.sh
+++ b/examples/aishell3/tts3/path.sh
--- a/examples/aishell3/vc0/README.md
+++ b/examples/aishell3/vc0/README.md
--- a/examples/aishell3/vc0/path.sh
+++ b/examples/aishell3/vc0/path.sh
--- a/examples/callcenter/s1/path.sh
+++ b/examples/callcenter/s1/path.sh
--- a/examples/csmsc/tts2/path.sh
+++ b/examples/csmsc/tts2/path.sh
--- a/examples/csmsc/tts3/path.sh
+++ b/examples/csmsc/tts3/path.sh
--- a/examples/csmsc/voc1/path.sh
+++ b/examples/csmsc/voc1/path.sh
--- a/examples/csmsc/voc3/path.sh
+++ b/examples/csmsc/voc3/path.sh
--- a/examples/librispeech/s0/path.sh
+++ b/examples/librispeech/s0/path.sh
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
--- a/examples/ljspeech/tts0/path.sh
+++ b/examples/ljspeech/tts0/path.sh
--- a/examples/ljspeech/tts1/path.sh
+++ b/examples/ljspeech/tts1/path.sh
--- a/examples/ljspeech/tts3/path.sh
+++ b/examples/ljspeech/tts3/path.sh
--- a/examples/ljspeech/voc0/path.sh
+++ b/examples/ljspeech/voc0/path.sh
--- a/examples/ljspeech/voc1/path.sh
+++ b/examples/ljspeech/voc1/path.sh
--- a/examples/other/1xt2x/src_deepspeech2x/__init__.py
+++ b/examples/other/1xt2x/src_deepspeech2x/__init__.py
--- a/examples/other/1xt2x/src_deepspeech2x/bin/test.py
+++ b/examples/other/1xt2x/src_deepspeech2x/bin/test.py
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
+++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/deepspeech2.py
--- a/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
+++ b/examples/other/1xt2x/src_deepspeech2x/models/ds2/rnn.py
--- a/examples/other/1xt2x/src_deepspeech2x/test_model.py
+++ b/examples/other/1xt2x/src_deepspeech2x/test_model.py
--- a/examples/other/g2p/.gitignore
+++ b/examples/other/g2p/.gitignore
--- a/examples/other/g2p/README.md
+++ b/examples/other/g2p/README.md
--- a/examples/other/g2p/zh/README.md
+++ b/examples/other/g2p/zh/README.md
--- a/examples/other/g2p/zh/local/convert_transcription.py
+++ b/examples/other/g2p/zh/local/convert_transcription.py
--- a/examples/other/g2p/zh/local/extract_pinyin_label.py
+++ b/examples/other/g2p/zh/local/extract_pinyin_label.py
--- a/examples/other/g2p/zh/local/ignore_sandhi.py
+++ b/examples/other/g2p/zh/local/ignore_sandhi.py
--- a/examples/other/g2p/zh/local/prepare_dataset.sh
+++ b/examples/other/g2p/zh/local/prepare_dataset.sh
--- a/examples/other/g2p/zh/path.sh
+++ b/examples/other/g2p/zh/path.sh
--- a/examples/other/g2p/zh/requirements.txt
+++ b/examples/other/g2p/zh/requirements.txt
--- a/examples/other/g2p/zh/run.sh
+++ b/examples/other/g2p/zh/run.sh
--- a/examples/other/ge2e/path.sh
+++ b/examples/other/ge2e/path.sh
--- a/examples/other/text_frontend/test_g2p.py
+++ b/examples/other/text_frontend/test_g2p.py
--- a/examples/other/text_frontend/test_textnorm.py
+++ b/examples/other/text_frontend/test_textnorm.py
--- a/examples/other/tn/.gitignore
+++ b/examples/other/tn/.gitignore
--- a/examples/other/tn/README.md
+++ b/examples/other/tn/README.md
--- a/examples/other/tn/data/sentences.txt
+++ b/examples/other/tn/data/sentences.txt
--- a/examples/other/tn/local/test_normalization.py
+++ b/examples/other/tn/local/test_normalization.py
--- a/examples/other/tn/path.sh
+++ b/examples/other/tn/path.sh
--- a/examples/other/tn/run.sh
+++ b/examples/other/tn/run.sh
--- a/examples/ted_en_zh/t0/path.sh
+++ b/examples/ted_en_zh/t0/path.sh
--- a/examples/timit/s1/path.sh
+++ b/examples/timit/s1/path.sh
--- a/examples/tiny/s0/path.sh
+++ b/examples/tiny/s0/path.sh
--- a/examples/tiny/s1/path.sh
+++ b/examples/tiny/s1/path.sh
--- a/examples/vctk/tts3/path.sh
+++ b/examples/vctk/tts3/path.sh
--- a/examples/vctk/voc1/path.sh
+++ b/examples/vctk/voc1/path.sh
--- a/hub/requirements.txt
+++ b/hub/requirements.txt
--- a/hub/setup_hub.sh
+++ b/hub/setup_hub.sh
--- a/paddleaudio/examples/panns/audio_tag.py
+++ b/paddleaudio/examples/panns/audio_tag.py
--- a/paddleaudio/examples/panns/parse_result.py
+++ b/paddleaudio/examples/panns/parse_result.py
--- a/paddleaudio/examples/sound_classification/deploy/python/predict.py
+++ b/paddleaudio/examples/sound_classification/deploy/python/predict.py
--- a/paddleaudio/examples/sound_classification/export_model.py
+++ b/paddleaudio/examples/sound_classification/export_model.py
--- a/paddleaudio/examples/sound_classification/predict.py
+++ b/paddleaudio/examples/sound_classification/predict.py
--- a/paddleaudio/examples/sound_classification/train.py
+++ b/paddleaudio/examples/sound_classification/train.py
--- a/paddleaudio/paddleaudio/features/augment.py
+++ b/paddleaudio/paddleaudio/features/augment.py
--- a/paddleaudio/paddleaudio/features/core.py
+++ b/paddleaudio/paddleaudio/features/core.py
--- a/paddleaudio/requirements.txt
+++ b/paddleaudio/requirements.txt
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
--- a/paddleaudio/test/unit_test/test_backend.py
+++ b/paddleaudio/test/unit_test/test_backend.py
--- a/paddleaudio/test/unit_test/test_features.py
+++ b/paddleaudio/test/unit_test/test_features.py
--- a/deepspeech/decoders/ctcdecoder/swig/__init__.py
+++ b/deepspeech/decoders/ctcdecoder/swig/__init__.py
--- a/paddlespeech/s2t/__init__.py
+++ b/paddlespeech/s2t/__init__.py
--- a/deepspeech/decoders/README.md
+++ b/deepspeech/decoders/README.md
--- a/deepspeech/decoders/__init__.py
+++ b/deepspeech/decoders/__init__.py
--- a/deepspeech/decoders/beam_search/__init__.py
+++ b/deepspeech/decoders/beam_search/__init__.py
--- a/deepspeech/decoders/beam_search/batch_beam_search.py
+++ b/deepspeech/decoders/beam_search/batch_beam_search.py
--- a/paddlespeech/s2t/decoders/beam_search/beam_search.py
+++ b/paddlespeech/s2t/decoders/beam_search/beam_search.py
--- a/deepspeech/decoders/scorers/__init__.py
+++ b/deepspeech/decoders/scorers/__init__.py
--- a/deepspeech/decoders/ctcdecoder/decoders_deprecated.py
+++ b/deepspeech/decoders/ctcdecoder/decoders_deprecated.py
--- a/deepspeech/decoders/ctcdecoder/scorer_deprecated.py
+++ b/deepspeech/decoders/ctcdecoder/scorer_deprecated.py
--- a/deepspeech/decoders/ctcdecoder/swig/.gitignore
+++ b/deepspeech/decoders/ctcdecoder/swig/.gitignore
--- a/deepspeech/exps/deepspeech2/__init__.py
+++ b/deepspeech/exps/deepspeech2/__init__.py
--- a/deepspeech/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/ctc_beam_search_decoder.cpp
--- a/deepspeech/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h
+++ b/deepspeech/decoders/ctcdecoder/swig/ctc_beam_search_decoder.h
--- a/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.cpp
--- a/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.h
+++ b/deepspeech/decoders/ctcdecoder/swig/ctc_greedy_decoder.h
--- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.cpp
--- a/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
+++ b/deepspeech/decoders/ctcdecoder/swig/decoder_utils.h
--- a/deepspeech/decoders/ctcdecoder/swig/decoders.i
+++ b/deepspeech/decoders/ctcdecoder/swig/decoders.i
--- a/deepspeech/decoders/ctcdecoder/swig/path_trie.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/path_trie.cpp
--- a/deepspeech/decoders/ctcdecoder/swig/path_trie.h
+++ b/deepspeech/decoders/ctcdecoder/swig/path_trie.h
--- a/deepspeech/decoders/ctcdecoder/swig/scorer.cpp
+++ b/deepspeech/decoders/ctcdecoder/swig/scorer.cpp
--- a/deepspeech/decoders/ctcdecoder/swig/scorer.h
+++ b/deepspeech/decoders/ctcdecoder/swig/scorer.h
--- a/deepspeech/decoders/ctcdecoder/swig/setup.py
+++ b/deepspeech/decoders/ctcdecoder/swig/setup.py
--- a/deepspeech/decoders/ctcdecoder/swig/setup.sh
+++ b/deepspeech/decoders/ctcdecoder/swig/setup.sh
--- a/deepspeech/decoders/ctcdecoder/swig_wrapper.py
+++ b/deepspeech/decoders/ctcdecoder/swig_wrapper.py
--- a/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py
+++ b/paddlespeech/s2t/decoders/ctcdecoder/tests/test_decoders.py
--- a/paddlespeech/s2t/decoders/recog.py
+++ b/paddlespeech/s2t/decoders/recog.py
--- a/paddlespeech/s2t/decoders/recog_bin.py
+++ b/paddlespeech/s2t/decoders/recog_bin.py
--- a/deepspeech/exps/lm/transformer/__init__.py
+++ b/deepspeech/exps/lm/transformer/__init__.py
--- a/paddlespeech/s2t/decoders/scorers/ctc.py
+++ b/paddlespeech/s2t/decoders/scorers/ctc.py
--- a/deepspeech/decoders/scorers/ctc_prefix_score.py
+++ b/deepspeech/decoders/scorers/ctc_prefix_score.py
--- a/deepspeech/decoders/scorers/length_bonus.py
+++ b/deepspeech/decoders/scorers/length_bonus.py
--- a/deepspeech/decoders/scorers/ngram.py
+++ b/deepspeech/decoders/scorers/ngram.py
--- a/deepspeech/decoders/scorers/scorer_interface.py
+++ b/deepspeech/decoders/scorers/scorer_interface.py
--- a/paddlespeech/s2t/decoders/utils.py
+++ b/paddlespeech/s2t/decoders/utils.py
--- a/paddlespeech/s2t/exps/__init__.py
+++ b/paddlespeech/s2t/exps/__init__.py
--- a/deepspeech/exps/u2/__init__.py
+++ b/deepspeech/exps/u2/__init__.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/client.py
--- a/deepspeech/exps/deepspeech2/bin/deploy/record.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/record.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/runtime.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/send.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/deploy/server.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/export.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_export.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/test_hub.py
--- a/paddlespeech/s2t/exps/deepspeech2/bin/train.py
+++ b/paddlespeech/s2t/exps/deepspeech2/bin/train.py
--- a/paddlespeech/s2t/exps/deepspeech2/config.py
+++ b/paddlespeech/s2t/exps/deepspeech2/config.py
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
--- a/deepspeech/exps/u2_kaldi/__init__.py
+++ b/deepspeech/exps/u2_kaldi/__init__.py
--- a/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py
+++ b/paddlespeech/s2t/exps/lm/transformer/bin/cacu_perplexity.py
--- a/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py
+++ b/paddlespeech/s2t/exps/lm/transformer/lm_cacu_perplexity.py
--- a/deepspeech/exps/u2_st/__init__.py
+++ b/deepspeech/exps/u2_st/__init__.py
--- a/paddlespeech/s2t/exps/u2/bin/alignment.py
+++ b/paddlespeech/s2t/exps/u2/bin/alignment.py
--- a/paddlespeech/s2t/exps/u2/bin/export.py
+++ b/paddlespeech/s2t/exps/u2/bin/export.py
--- a/paddlespeech/s2t/exps/u2/bin/test.py
+++ b/paddlespeech/s2t/exps/u2/bin/test.py
--- a/paddlespeech/s2t/exps/u2/bin/test_hub.py
+++ b/paddlespeech/s2t/exps/u2/bin/test_hub.py
--- a/paddlespeech/s2t/exps/u2/bin/train.py
+++ b/paddlespeech/s2t/exps/u2/bin/train.py
--- a/paddlespeech/s2t/exps/u2/config.py
+++ b/paddlespeech/s2t/exps/u2/config.py
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
--- a/paddlespeech/s2t/exps/u2/trainer.py
+++ b/paddlespeech/s2t/exps/u2/trainer.py
--- a/deepspeech/frontend/__init__.py
+++ b/deepspeech/frontend/__init__.py
--- a/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/bin/recog.py
--- a/paddlespeech/s2t/exps/u2_kaldi/bin/test.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/bin/test.py
--- a/paddlespeech/s2t/exps/u2_kaldi/bin/train.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/bin/train.py
--- a/paddlespeech/s2t/exps/u2_kaldi/model.py
+++ b/paddlespeech/s2t/exps/u2_kaldi/model.py
--- a/deepspeech/frontend/augmentor/__init__.py
+++ b/deepspeech/frontend/augmentor/__init__.py
--- a/paddlespeech/s2t/exps/u2_st/bin/export.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/export.py
--- a/paddlespeech/s2t/exps/u2_st/bin/test.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/test.py
--- a/paddlespeech/s2t/exps/u2_st/bin/train.py
+++ b/paddlespeech/s2t/exps/u2_st/bin/train.py
--- a/paddlespeech/s2t/exps/u2_st/config.py
+++ b/paddlespeech/s2t/exps/u2_st/config.py
--- a/paddlespeech/s2t/exps/u2_st/model.py
+++ b/paddlespeech/s2t/exps/u2_st/model.py
--- a/deepspeech/io/__init__.py
+++ b/deepspeech/io/__init__.py
--- a/deepspeech/frontend/audio.py
+++ b/deepspeech/frontend/audio.py
--- a/deepspeech/models/__init__.py
+++ b/deepspeech/models/__init__.py
--- a/paddlespeech/s2t/frontend/augmentor/augmentation.py
+++ b/paddlespeech/s2t/frontend/augmentor/augmentation.py
--- a/deepspeech/frontend/augmentor/base.py
+++ b/deepspeech/frontend/augmentor/base.py
--- a/paddlespeech/s2t/frontend/augmentor/impulse_response.py
+++ b/paddlespeech/s2t/frontend/augmentor/impulse_response.py
--- a/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/noise_perturb.py
--- a/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py
+++ b/paddlespeech/s2t/frontend/augmentor/online_bayesian_normalization.py
--- a/paddlespeech/s2t/frontend/augmentor/resample.py
+++ b/paddlespeech/s2t/frontend/augmentor/resample.py
--- a/paddlespeech/s2t/frontend/augmentor/shift_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/shift_perturb.py
--- a/paddlespeech/s2t/frontend/augmentor/spec_augment.py
+++ b/paddlespeech/s2t/frontend/augmentor/spec_augment.py
--- a/paddlespeech/s2t/frontend/augmentor/speed_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/speed_perturb.py
--- a/paddlespeech/s2t/frontend/augmentor/volume_perturb.py
+++ b/paddlespeech/s2t/frontend/augmentor/volume_perturb.py
--- a/deepspeech/frontend/featurizer/__init__.py
+++ b/deepspeech/frontend/featurizer/__init__.py
--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
--- a/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/speech_featurizer.py
--- a/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
+++ b/paddlespeech/s2t/frontend/featurizer/text_featurizer.py
--- a/paddlespeech/s2t/frontend/normalizer.py
+++ b/paddlespeech/s2t/frontend/normalizer.py
--- a/paddlespeech/s2t/frontend/speech.py
+++ b/paddlespeech/s2t/frontend/speech.py
--- a/paddlespeech/s2t/frontend/utility.py
+++ b/paddlespeech/s2t/frontend/utility.py
--- a/deepspeech/models/lm/__init__.py
+++ b/deepspeech/models/lm/__init__.py
--- a/paddlespeech/s2t/io/batchfy.py
+++ b/paddlespeech/s2t/io/batchfy.py
--- a/paddlespeech/s2t/io/collator.py
+++ b/paddlespeech/s2t/io/collator.py
--- a/paddlespeech/s2t/io/converter.py
+++ b/paddlespeech/s2t/io/converter.py
--- a/paddlespeech/s2t/io/dataloader.py
+++ b/paddlespeech/s2t/io/dataloader.py
--- a/paddlespeech/s2t/io/dataset.py
+++ b/paddlespeech/s2t/io/dataset.py
--- a/paddlespeech/s2t/io/reader.py
+++ b/paddlespeech/s2t/io/reader.py
--- a/paddlespeech/s2t/io/sampler.py
+++ b/paddlespeech/s2t/io/sampler.py
--- a/paddlespeech/s2t/io/utility.py
+++ b/paddlespeech/s2t/io/utility.py
--- a/deepspeech/modules/__init__.py
+++ b/deepspeech/modules/__init__.py
--- a/paddlespeech/s2t/models/asr_interface.py
+++ b/paddlespeech/s2t/models/asr_interface.py
--- a/deepspeech/models/ds2/__init__.py
+++ b/deepspeech/models/ds2/__init__.py
--- a/paddlespeech/s2t/models/ds2/conv.py
+++ b/paddlespeech/s2t/models/ds2/conv.py
--- a/paddlespeech/s2t/models/ds2/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2/deepspeech2.py
--- a/paddlespeech/s2t/models/ds2/rnn.py
+++ b/paddlespeech/s2t/models/ds2/rnn.py
--- a/deepspeech/models/ds2_online/__init__.py
+++ b/deepspeech/models/ds2_online/__init__.py
--- a/paddlespeech/s2t/models/ds2_online/conv.py
+++ b/paddlespeech/s2t/models/ds2_online/conv.py
--- a/paddlespeech/s2t/models/ds2_online/deepspeech2.py
+++ b/paddlespeech/s2t/models/ds2_online/deepspeech2.py
--- a/deepspeech/training/__init__.py
+++ b/deepspeech/training/__init__.py
--- a/paddlespeech/s2t/models/lm/transformer.py
+++ b/paddlespeech/s2t/models/lm/transformer.py
--- a/paddlespeech/s2t/models/lm_interface.py
+++ b/paddlespeech/s2t/models/lm_interface.py
--- a/paddlespeech/s2t/models/st_interface.py
+++ b/paddlespeech/s2t/models/st_interface.py
--- a/deepspeech/models/u2/__init__.py
+++ b/deepspeech/models/u2/__init__.py
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
--- a/paddlespeech/s2t/models/u2/updater.py
+++ b/paddlespeech/s2t/models/u2/updater.py
--- a/deepspeech/models/u2_st/__init__.py
+++ b/deepspeech/models/u2_st/__init__.py
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
--- a/deepspeech/training/triggers/__init__.py
+++ b/deepspeech/training/triggers/__init__.py
--- a/paddlespeech/s2t/modules/activation.py
+++ b/paddlespeech/s2t/modules/activation.py
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
--- a/paddlespeech/s2t/modules/cmvn.py
+++ b/paddlespeech/s2t/modules/cmvn.py
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
--- a/paddlespeech/s2t/modules/crf.py
+++ b/paddlespeech/s2t/modules/crf.py
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
--- a/paddlespeech/s2t/modules/decoder_layer.py
+++ b/paddlespeech/s2t/modules/decoder_layer.py
--- a/paddlespeech/s2t/modules/embedding.py
+++ b/paddlespeech/s2t/modules/embedding.py
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
--- a/paddlespeech/s2t/modules/loss.py
+++ b/paddlespeech/s2t/modules/loss.py
--- a/paddlespeech/s2t/modules/mask.py
+++ b/paddlespeech/s2t/modules/mask.py
--- a/paddlespeech/s2t/modules/positionwise_feed_forward.py
+++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py
--- a/paddlespeech/s2t/modules/subsampling.py
+++ b/paddlespeech/s2t/modules/subsampling.py
--- a/deepspeech/training/updaters/__init__.py
+++ b/deepspeech/training/updaters/__init__.py
--- a/paddlespeech/s2t/training/cli.py
+++ b/paddlespeech/s2t/training/cli.py
--- a/deepspeech/training/extensions/__init__.py
+++ b/deepspeech/training/extensions/__init__.py
--- a/paddlespeech/s2t/training/extensions/evaluator.py
+++ b/paddlespeech/s2t/training/extensions/evaluator.py
--- a/deepspeech/training/extensions/extension.py
+++ b/deepspeech/training/extensions/extension.py
--- a/deepspeech/training/extensions/plot.py
+++ b/deepspeech/training/extensions/plot.py
--- a/paddlespeech/s2t/training/extensions/snapshot.py
+++ b/paddlespeech/s2t/training/extensions/snapshot.py
--- a/deepspeech/training/extensions/visualizer.py
+++ b/deepspeech/training/extensions/visualizer.py
--- a/paddlespeech/s2t/training/gradclip.py
+++ b/paddlespeech/s2t/training/gradclip.py
--- a/paddlespeech/s2t/training/optimizer.py
+++ b/paddlespeech/s2t/training/optimizer.py
--- a/deepspeech/training/reporter.py
+++ b/deepspeech/training/reporter.py
--- a/paddlespeech/s2t/training/scheduler.py
+++ b/paddlespeech/s2t/training/scheduler.py
--- a/paddlespeech/s2t/training/timer.py
+++ b/paddlespeech/s2t/training/timer.py
--- a/paddlespeech/s2t/training/trainer.py
+++ b/paddlespeech/s2t/training/trainer.py
--- a/deepspeech/transform/__init__.py
+++ b/deepspeech/transform/__init__.py
--- a/deepspeech/training/triggers/compare_value_trigger.py
+++ b/deepspeech/training/triggers/compare_value_trigger.py
--- a/deepspeech/training/triggers/interval_trigger.py
+++ b/deepspeech/training/triggers/interval_trigger.py
--- a/deepspeech/training/triggers/limit_trigger.py
+++ b/deepspeech/training/triggers/limit_trigger.py
--- a/deepspeech/training/triggers/time_trigger.py
+++ b/deepspeech/training/triggers/time_trigger.py
--- a/deepspeech/training/triggers/utils.py
+++ b/deepspeech/training/triggers/utils.py
--- a/deepspeech/utils/__init__.py
+++ b/deepspeech/utils/__init__.py
--- a/paddlespeech/s2t/training/updaters/standard_updater.py
+++ b/paddlespeech/s2t/training/updaters/standard_updater.py
--- a/paddlespeech/s2t/training/updaters/trainer.py
+++ b/paddlespeech/s2t/training/updaters/trainer.py
--- a/paddlespeech/s2t/training/updaters/updater.py
+++ b/paddlespeech/s2t/training/updaters/updater.py
--- a/paddleaudio/paddleaudio/models/__init__.py
+++ b/paddleaudio/paddleaudio/models/__init__.py
--- a/deepspeech/transform/add_deltas.py
+++ b/deepspeech/transform/add_deltas.py
--- a/deepspeech/transform/channel_selector.py
+++ b/deepspeech/transform/channel_selector.py
--- a/deepspeech/transform/cmvn.py
+++ b/deepspeech/transform/cmvn.py
--- a/paddlespeech/s2t/transform/functional.py
+++ b/paddlespeech/s2t/transform/functional.py
--- a/paddlespeech/s2t/transform/perturb.py
+++ b/paddlespeech/s2t/transform/perturb.py
--- a/paddlespeech/s2t/transform/spec_augment.py
+++ b/paddlespeech/s2t/transform/spec_augment.py
--- a/deepspeech/transform/spectrogram.py
+++ b/deepspeech/transform/spectrogram.py
--- a/deepspeech/transform/transform_interface.py
+++ b/deepspeech/transform/transform_interface.py
--- a/paddlespeech/s2t/transform/transformation.py
+++ b/paddlespeech/s2t/transform/transformation.py
--- a/deepspeech/transform/wpe.py
+++ b/deepspeech/transform/wpe.py
--- a/parakeet/exps/gan_vocoder/multi_band_melgan/__init__.py
+++ b/parakeet/exps/gan_vocoder/multi_band_melgan/__init__.py
--- a/deepspeech/utils/asr_utils.py
+++ b/deepspeech/utils/asr_utils.py
--- a/deepspeech/utils/bleu_score.py
+++ b/deepspeech/utils/bleu_score.py
--- a/deepspeech/utils/check_kwargs.py
+++ b/deepspeech/utils/check_kwargs.py
--- a/paddlespeech/s2t/utils/checkpoint.py
+++ b/paddlespeech/s2t/utils/checkpoint.py
--- a/paddlespeech/s2t/utils/cli_readers.py
+++ b/paddlespeech/s2t/utils/cli_readers.py
--- a/deepspeech/utils/cli_utils.py
+++ b/deepspeech/utils/cli_utils.py
--- a/paddlespeech/s2t/utils/cli_writers.py
+++ b/paddlespeech/s2t/utils/cli_writers.py
--- a/paddlespeech/s2t/utils/ctc_utils.py
+++ b/paddlespeech/s2t/utils/ctc_utils.py
--- a/paddlespeech/s2t/utils/dynamic_import.py
+++ b/paddlespeech/s2t/utils/dynamic_import.py
--- a/deepspeech/utils/error_rate.py
+++ b/deepspeech/utils/error_rate.py
--- a/deepspeech/utils/layer_tools.py
+++ b/deepspeech/utils/layer_tools.py
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
--- a/deepspeech/utils/mp_tools.py
+++ b/deepspeech/utils/mp_tools.py
--- a/paddlespeech/s2t/utils/profiler.py
+++ b/paddlespeech/s2t/utils/profiler.py
--- a/paddlespeech/s2t/utils/socket_server.py
+++ b/paddlespeech/s2t/utils/socket_server.py
--- a/deepspeech/utils/spec_augment.py
+++ b/deepspeech/utils/spec_augment.py
--- a/paddlespeech/s2t/utils/tensor_utils.py
+++ b/paddlespeech/s2t/utils/tensor_utils.py
--- a/deepspeech/utils/text_grid.py
+++ b/deepspeech/utils/text_grid.py
--- a/paddlespeech/s2t/utils/utility.py
+++ b/paddlespeech/s2t/utils/utility.py
--- a/parakeet/__init__.py
+++ b/parakeet/__init__.py
--- a/parakeet/audio/__init__.py
+++ b/parakeet/audio/__init__.py
--- a/parakeet/audio/audio.py
+++ b/parakeet/audio/audio.py
--- a/parakeet/audio/spec_normalizer.py
+++ b/parakeet/audio/spec_normalizer.py
--- a/paddlespeech/t2s/data/__init__.py
+++ b/paddlespeech/t2s/data/__init__.py
--- a/parakeet/data/batch.py
+++ b/parakeet/data/batch.py
--- a/parakeet/data/dataset.py
+++ b/parakeet/data/dataset.py
--- a/parakeet/data/get_feats.py
+++ b/parakeet/data/get_feats.py
--- a/parakeet/datasets/__init__.py
+++ b/parakeet/datasets/__init__.py
--- a/paddlespeech/t2s/datasets/am_batch_fn.py
+++ b/paddlespeech/t2s/datasets/am_batch_fn.py
--- a/parakeet/datasets/common.py
+++ b/parakeet/datasets/common.py
--- a/parakeet/datasets/data_table.py
+++ b/parakeet/datasets/data_table.py
--- a/parakeet/datasets/ljspeech.py
+++ b/parakeet/datasets/ljspeech.py
--- a/parakeet/datasets/preprocess_utils.py
+++ b/parakeet/datasets/preprocess_utils.py
--- a/parakeet/datasets/vocoder_batch_fn.py
+++ b/parakeet/datasets/vocoder_batch_fn.py
--- a/parakeet/exps/__init__.py
+++ b/parakeet/exps/__init__.py
--- a/parakeet/exps/fastspeech2/__init__.py
+++ b/parakeet/exps/fastspeech2/__init__.py
--- a/paddlespeech/t2s/exps/fastspeech2/inference.py
+++ b/paddlespeech/t2s/exps/fastspeech2/inference.py
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e.py
--- a/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
--- a/paddlespeech/t2s/exps/fastspeech2/normalize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/normalize.py
--- a/paddlespeech/t2s/exps/fastspeech2/preprocess.py
+++ b/paddlespeech/t2s/exps/fastspeech2/preprocess.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_en.py
--- a/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
+++ b/paddlespeech/t2s/exps/fastspeech2/synthesize_e2e_melgan.py
--- a/paddlespeech/t2s/exps/fastspeech2/train.py
+++ b/paddlespeech/t2s/exps/fastspeech2/train.py
--- a/parakeet/exps/gan_vocoder/README.md
+++ b/parakeet/exps/gan_vocoder/README.md
--- a/parakeet/exps/gan_vocoder/__init__.py
+++ b/parakeet/exps/gan_vocoder/__init__.py
--- a/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py
+++ b/parakeet/exps/gan_vocoder/parallelwave_gan/__init__.py
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/synthesize.py
--- a/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/multi_band_melgan/train.py
--- a/paddlespeech/t2s/exps/gan_vocoder/normalize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/normalize.py
--- a/parakeet/modules/fastspeech2_predictor/__init__.py
+++ b/parakeet/modules/fastspeech2_predictor/__init__.py
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize.py
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
--- a/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/parallelwave_gan/train.py
--- a/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
+++ b/paddlespeech/t2s/exps/gan_vocoder/preprocess.py
--- a/parakeet/exps/ge2e/__init__.py
+++ b/parakeet/exps/ge2e/__init__.py
--- a/parakeet/exps/ge2e/audio_processor.py
+++ b/parakeet/exps/ge2e/audio_processor.py
--- a/parakeet/exps/ge2e/config.py
+++ b/parakeet/exps/ge2e/config.py
--- a/paddlespeech/t2s/exps/ge2e/dataset_processors.py
+++ b/paddlespeech/t2s/exps/ge2e/dataset_processors.py
--- a/paddlespeech/t2s/exps/ge2e/inference.py
+++ b/paddlespeech/t2s/exps/ge2e/inference.py
--- a/paddlespeech/t2s/exps/ge2e/preprocess.py
+++ b/paddlespeech/t2s/exps/ge2e/preprocess.py
--- a/parakeet/exps/ge2e/random_cycle.py
+++ b/parakeet/exps/ge2e/random_cycle.py
--- a/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
+++ b/paddlespeech/t2s/exps/ge2e/speaker_verification_dataset.py
--- a/paddlespeech/t2s/exps/ge2e/train.py
+++ b/paddlespeech/t2s/exps/ge2e/train.py
--- a/parakeet/exps/sentences.txt
+++ b/parakeet/exps/sentences.txt
--- a/parakeet/exps/sentences_en.txt
+++ b/parakeet/exps/sentences_en.txt
--- a/parakeet/exps/speedyspeech/__init__.py
+++ b/parakeet/exps/speedyspeech/__init__.py
--- a/paddlespeech/t2s/exps/speedyspeech/inference.py
+++ b/paddlespeech/t2s/exps/speedyspeech/inference.py
--- a/paddlespeech/t2s/exps/speedyspeech/normalize.py
+++ b/paddlespeech/t2s/exps/speedyspeech/normalize.py
--- a/paddlespeech/t2s/exps/speedyspeech/preprocess.py
+++ b/paddlespeech/t2s/exps/speedyspeech/preprocess.py
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize.py
--- a/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/speedyspeech/synthesize_e2e.py
--- a/paddlespeech/t2s/exps/speedyspeech/train.py
+++ b/paddlespeech/t2s/exps/speedyspeech/train.py
--- a/parakeet/exps/tacotron2/__init__.py
+++ b/parakeet/exps/tacotron2/__init__.py
--- a/parakeet/exps/tacotron2/config.py
+++ b/parakeet/exps/tacotron2/config.py
--- a/paddlespeech/t2s/exps/tacotron2/ljspeech.py
+++ b/paddlespeech/t2s/exps/tacotron2/ljspeech.py
--- a/paddlespeech/t2s/exps/tacotron2/preprocess.py
+++ b/paddlespeech/t2s/exps/tacotron2/preprocess.py
--- a/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb
+++ b/paddlespeech/t2s/exps/tacotron2/synthesize.ipynb
--- a/paddlespeech/t2s/exps/tacotron2/synthesize.py
+++ b/paddlespeech/t2s/exps/tacotron2/synthesize.py
--- a/paddlespeech/t2s/exps/tacotron2/train.py
+++ b/paddlespeech/t2s/exps/tacotron2/train.py
--- a/parakeet/exps/transformer_tts/__init__.py
+++ b/parakeet/exps/transformer_tts/__init__.py
--- a/paddlespeech/t2s/exps/transformer_tts/normalize.py
+++ b/paddlespeech/t2s/exps/transformer_tts/normalize.py
--- a/paddlespeech/t2s/exps/transformer_tts/preprocess.py
+++ b/paddlespeech/t2s/exps/transformer_tts/preprocess.py
--- a/paddlespeech/t2s/exps/transformer_tts/synthesize.py
+++ b/paddlespeech/t2s/exps/transformer_tts/synthesize.py
--- a/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
+++ b/paddlespeech/t2s/exps/transformer_tts/synthesize_e2e.py
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
--- a/parakeet/exps/voice_cloning/__init__.py
+++ b/parakeet/exps/voice_cloning/__init__.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/__init__.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/__init__.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/aishell3.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/config.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/config.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/lexicon.txt
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/lexicon.txt
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/preprocess_transcription.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/process_wav.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/process_wav.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/train.py
--- a/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
+++ b/paddlespeech/t2s/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
--- a/parakeet/exps/waveflow/__init__.py
+++ b/parakeet/exps/waveflow/__init__.py
--- a/parakeet/exps/waveflow/config.py
+++ b/parakeet/exps/waveflow/config.py
--- a/paddlespeech/t2s/exps/waveflow/ljspeech.py
+++ b/paddlespeech/t2s/exps/waveflow/ljspeech.py
--- a/paddlespeech/t2s/exps/waveflow/preprocess.py
+++ b/paddlespeech/t2s/exps/waveflow/preprocess.py
--- a/paddlespeech/t2s/exps/waveflow/synthesize.py
+++ b/paddlespeech/t2s/exps/waveflow/synthesize.py
--- a/paddlespeech/t2s/exps/waveflow/train.py
+++ b/paddlespeech/t2s/exps/waveflow/train.py
--- a/parakeet/frontend/__init__.py
+++ b/parakeet/frontend/__init__.py
--- a/paddlespeech/t2s/frontend/arpabet.py
+++ b/paddlespeech/t2s/frontend/arpabet.py
--- a/parakeet/frontend/generate_lexicon.py
+++ b/parakeet/frontend/generate_lexicon.py
--- a/paddlespeech/t2s/frontend/normalizer/__init__.py
+++ b/paddlespeech/t2s/frontend/normalizer/__init__.py
--- a/parakeet/frontend/normalizer/abbrrviation.py
+++ b/parakeet/frontend/normalizer/abbrrviation.py
--- a/parakeet/frontend/normalizer/acronyms.py
+++ b/parakeet/frontend/normalizer/acronyms.py
--- a/paddlespeech/t2s/frontend/normalizer/normalizer.py
+++ b/paddlespeech/t2s/frontend/normalizer/normalizer.py
--- a/parakeet/frontend/normalizer/numbers.py
+++ b/parakeet/frontend/normalizer/numbers.py
--- a/parakeet/frontend/normalizer/width.py
+++ b/parakeet/frontend/normalizer/width.py
--- a/paddlespeech/t2s/frontend/phonectic.py
+++ b/paddlespeech/t2s/frontend/phonectic.py
--- a/paddlespeech/t2s/frontend/pinyin.py
+++ b/paddlespeech/t2s/frontend/pinyin.py
--- a/parakeet/frontend/punctuation.py
+++ b/parakeet/frontend/punctuation.py
--- a/parakeet/frontend/tone_sandhi.py
+++ b/parakeet/frontend/tone_sandhi.py
--- a/parakeet/frontend/vocab.py
+++ b/parakeet/frontend/vocab.py
--- a/paddlespeech/t2s/frontend/zh_frontend.py
+++ b/paddlespeech/t2s/frontend/zh_frontend.py
--- a/parakeet/frontend/zh_normalization/README.md
+++ b/parakeet/frontend/zh_normalization/README.md
--- a/paddlespeech/t2s/frontend/zh_normalization/__init__.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/__init__.py
--- a/parakeet/frontend/zh_normalization/char_convert.py
+++ b/parakeet/frontend/zh_normalization/char_convert.py
--- a/parakeet/frontend/zh_normalization/chronology.py
+++ b/parakeet/frontend/zh_normalization/chronology.py
--- a/parakeet/frontend/zh_normalization/constants.py
+++ b/parakeet/frontend/zh_normalization/constants.py
--- a/parakeet/frontend/zh_normalization/num.py
+++ b/parakeet/frontend/zh_normalization/num.py
--- a/parakeet/frontend/zh_normalization/phonecode.py
+++ b/parakeet/frontend/zh_normalization/phonecode.py
--- a/parakeet/frontend/zh_normalization/quantifier.py
+++ b/parakeet/frontend/zh_normalization/quantifier.py
--- a/parakeet/frontend/zh_normalization/text_normlization.py
+++ b/parakeet/frontend/zh_normalization/text_normlization.py
--- a/parakeet/models/__init__.py
+++ b/parakeet/models/__init__.py
--- a/parakeet/models/fastspeech2/__init__.py
+++ b/parakeet/models/fastspeech2/__init__.py
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2_updater.py
--- a/parakeet/models/lstm_speaker_encoder.py
+++ b/parakeet/models/lstm_speaker_encoder.py
--- a/parakeet/models/melgan/__init__.py
+++ b/parakeet/models/melgan/__init__.py
--- a/paddlespeech/t2s/models/melgan/melgan.py
+++ b/paddlespeech/t2s/models/melgan/melgan.py
--- a/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py
+++ b/paddlespeech/t2s/models/melgan/multi_band_melgan_updater.py
--- a/parakeet/models/parallel_wavegan/__init__.py
+++ b/parakeet/models/parallel_wavegan/__init__.py
--- a/parakeet/models/parallel_wavegan/parallel_wavegan.py
+++ b/parakeet/models/parallel_wavegan/parallel_wavegan.py
--- a/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py
+++ b/paddlespeech/t2s/models/parallel_wavegan/parallel_wavegan_updater.py
--- a/parakeet/models/speedyspeech/__init__.py
+++ b/parakeet/models/speedyspeech/__init__.py
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech.py
--- a/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
+++ b/paddlespeech/t2s/models/speedyspeech/speedyspeech_updater.py
--- a/paddlespeech/t2s/models/tacotron2.py
+++ b/paddlespeech/t2s/models/tacotron2.py
--- a/parakeet/models/transformer_tts/__init__.py
+++ b/parakeet/models/transformer_tts/__init__.py
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts.py
--- a/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
+++ b/paddlespeech/t2s/models/transformer_tts/transformer_tts_updater.py
--- a/paddlespeech/t2s/models/waveflow.py
+++ b/paddlespeech/t2s/models/waveflow.py
--- a/parakeet/modules/__init__.py
+++ b/parakeet/modules/__init__.py
--- a/parakeet/modules/adversarial_loss.py
+++ b/parakeet/modules/adversarial_loss.py
--- a/parakeet/modules/attention.py
+++ b/parakeet/modules/attention.py
--- a/parakeet/modules/audio.py
+++ b/parakeet/modules/audio.py
--- a/parakeet/modules/causal_conv.py
+++ b/parakeet/modules/causal_conv.py
--- a/parakeet/modules/conv.py
+++ b/parakeet/modules/conv.py
--- a/parakeet/modules/expansion.py
+++ b/parakeet/modules/expansion.py
--- a/text_processing/speechtask/punctuation_restoration/io/__init__.py
+++ b/text_processing/speechtask/punctuation_restoration/io/__init__.py
--- a/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py
+++ b/paddlespeech/t2s/modules/fastspeech2_predictor/duration_predictor.py
--- a/parakeet/modules/fastspeech2_predictor/length_regulator.py
+++ b/parakeet/modules/fastspeech2_predictor/length_regulator.py
--- a/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py
+++ b/paddlespeech/t2s/modules/fastspeech2_predictor/variance_predictor.py
--- a/parakeet/modules/fastspeech2_transformer/__init__.py
+++ b/parakeet/modules/fastspeech2_transformer/__init__.py
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/attention.py
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/decoder.py
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/decoder_layer.py
--- a/parakeet/modules/fastspeech2_transformer/embedding.py
+++ b/parakeet/modules/fastspeech2_transformer/embedding.py
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/encoder.py
--- a/parakeet/modules/fastspeech2_transformer/encoder_layer.py
+++ b/parakeet/modules/fastspeech2_transformer/encoder_layer.py
--- a/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py
+++ b/paddlespeech/t2s/modules/fastspeech2_transformer/lightconv.py
--- a/parakeet/modules/fastspeech2_transformer/mask.py
+++ b/parakeet/modules/fastspeech2_transformer/mask.py
--- a/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py
+++ b/parakeet/modules/fastspeech2_transformer/multi_layer_conv.py
--- a/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py
+++ b/parakeet/modules/fastspeech2_transformer/positionwise_feed_forward.py
--- a/parakeet/modules/fastspeech2_transformer/repeat.py
+++ b/parakeet/modules/fastspeech2_transformer/repeat.py
--- a/parakeet/modules/geometry.py
+++ b/parakeet/modules/geometry.py
--- a/parakeet/modules/glu.py
+++ b/parakeet/modules/glu.py
--- a/parakeet/modules/layer_norm.py
+++ b/parakeet/modules/layer_norm.py
--- a/parakeet/modules/losses.py
+++ b/parakeet/modules/losses.py
--- a/parakeet/modules/masked_fill.py
+++ b/parakeet/modules/masked_fill.py
--- a/parakeet/modules/masking.py
+++ b/parakeet/modules/masking.py
--- a/parakeet/modules/nets_utils.py
+++ b/parakeet/modules/nets_utils.py
--- a/parakeet/modules/normalizer.py
+++ b/parakeet/modules/normalizer.py
--- a/parakeet/modules/positional_encoding.py
+++ b/parakeet/modules/positional_encoding.py
--- a/parakeet/modules/pqmf.py
+++ b/parakeet/modules/pqmf.py
--- a/paddlespeech/t2s/modules/residual_stack.py
+++ b/paddlespeech/t2s/modules/residual_stack.py
--- a/parakeet/modules/ssim.py
+++ b/parakeet/modules/ssim.py
--- a/parakeet/modules/stft_loss.py
+++ b/parakeet/modules/stft_loss.py
--- a/paddlespeech/t2s/modules/style_encoder.py
+++ b/paddlespeech/t2s/modules/style_encoder.py
--- a/parakeet/modules/tacotron2/__init__.py
+++ b/parakeet/modules/tacotron2/__init__.py
--- a/parakeet/modules/tacotron2/decoder.py
+++ b/parakeet/modules/tacotron2/decoder.py
--- a/parakeet/modules/tacotron2/encoder.py
+++ b/parakeet/modules/tacotron2/encoder.py
--- a/paddlespeech/t2s/modules/transformer.py
+++ b/paddlespeech/t2s/modules/transformer.py
--- a/parakeet/training/__init__.py
+++ b/parakeet/training/__init__.py
--- a/paddlespeech/t2s/training/cli.py
+++ b/paddlespeech/t2s/training/cli.py
--- a/parakeet/training/default_config.py
+++ b/parakeet/training/default_config.py
--- a/paddlespeech/t2s/training/experiment.py
+++ b/paddlespeech/t2s/training/experiment.py
--- a/parakeet/training/extension.py
+++ b/parakeet/training/extension.py
--- a/parakeet/training/extensions/__init__.py
+++ b/parakeet/training/extensions/__init__.py
--- a/paddlespeech/t2s/training/extensions/evaluator.py
+++ b/paddlespeech/t2s/training/extensions/evaluator.py
--- a/paddlespeech/t2s/training/extensions/snapshot.py
+++ b/paddlespeech/t2s/training/extensions/snapshot.py
--- a/paddlespeech/t2s/training/extensions/visualizer.py
+++ b/paddlespeech/t2s/training/extensions/visualizer.py
--- a/parakeet/training/optimizer.py
+++ b/parakeet/training/optimizer.py
--- a/parakeet/training/reporter.py
+++ b/parakeet/training/reporter.py
--- a/parakeet/training/seeding.py
+++ b/parakeet/training/seeding.py
--- a/paddlespeech/t2s/training/trainer.py
+++ b/paddlespeech/t2s/training/trainer.py
--- a/paddlespeech/t2s/training/trigger.py
+++ b/paddlespeech/t2s/training/trigger.py
--- a/parakeet/training/triggers/__init__.py
+++ b/parakeet/training/triggers/__init__.py
--- a/parakeet/training/triggers/interval_trigger.py
+++ b/parakeet/training/triggers/interval_trigger.py
--- a/parakeet/training/triggers/limit_trigger.py
+++ b/parakeet/training/triggers/limit_trigger.py
--- a/parakeet/training/triggers/time_trigger.py
+++ b/parakeet/training/triggers/time_trigger.py
--- a/parakeet/training/updater.py
+++ b/parakeet/training/updater.py
--- a/parakeet/training/updaters/__init__.py
+++ b/parakeet/training/updaters/__init__.py
--- a/paddlespeech/t2s/training/updaters/standard_updater.py
+++ b/paddlespeech/t2s/training/updaters/standard_updater.py
--- a/parakeet/utils/__init__.py
+++ b/parakeet/utils/__init__.py
--- a/paddlespeech/t2s/utils/checkpoint.py
+++ b/paddlespeech/t2s/utils/checkpoint.py
--- a/parakeet/utils/display.py
+++ b/parakeet/utils/display.py
--- a/parakeet/utils/error_rate.py
+++ b/parakeet/utils/error_rate.py
--- a/parakeet/utils/h5_utils.py
+++ b/parakeet/utils/h5_utils.py
--- a/parakeet/utils/internals.py
+++ b/parakeet/utils/internals.py
--- a/parakeet/utils/layer_tools.py
+++ b/parakeet/utils/layer_tools.py
--- a/parakeet/utils/mp_tools.py
+++ b/parakeet/utils/mp_tools.py
--- a/parakeet/utils/profile.py
+++ b/parakeet/utils/profile.py
--- a/parakeet/utils/profiler.py
+++ b/parakeet/utils/profiler.py
--- a/parakeet/utils/scheduler.py
+++ b/parakeet/utils/scheduler.py
--- a/parakeet/utils/timeline.py
+++ b/parakeet/utils/timeline.py
--- a/text_processing/.gitignore
+++ b/text_processing/.gitignore
--- a/text_processing/README.md
+++ b/text_processing/README.md
--- a/text_processing/examples/punctuation_restoration/chinese/README.md
+++ b/text_processing/examples/punctuation_restoration/chinese/README.md
--- a/text_processing/examples/punctuation_restoration/chinese/conf/blstm.yaml
+++ b/text_processing/examples/punctuation_restoration/chinese/conf/blstm.yaml
--- a/text_processing/examples/punctuation_restoration/chinese/conf/data_conf/chinese.yaml
+++ b/text_processing/examples/punctuation_restoration/chinese/conf/data_conf/chinese.yaml
--- a/text_processing/examples/punctuation_restoration/chinese/conf/train_conf/bertBLSTM_zh.yaml
+++ b/text_processing/examples/punctuation_restoration/chinese/conf/train_conf/bertBLSTM_zh.yaml
--- a/text_processing/examples/punctuation_restoration/chinese/conf/train_conf/bertLinear_zh.yaml
+++ b/text_processing/examples/punctuation_restoration/chinese/conf/train_conf/bertLinear_zh.yaml
--- a/text_processing/examples/punctuation_restoration/chinese/local/avg.sh
+++ b/text_processing/examples/punctuation_restoration/chinese/local/avg.sh
--- a/text_processing/examples/punctuation_restoration/chinese/local/data.sh
+++ b/text_processing/examples/punctuation_restoration/chinese/local/data.sh
--- a/text_processing/examples/punctuation_restoration/chinese/local/test.sh
+++ b/text_processing/examples/punctuation_restoration/chinese/local/test.sh
--- a/text_processing/examples/punctuation_restoration/chinese/local/train.sh
+++ b/text_processing/examples/punctuation_restoration/chinese/local/train.sh
--- a/text_processing/examples/punctuation_restoration/chinese/path.sh
+++ b/text_processing/examples/punctuation_restoration/chinese/path.sh
--- a/text_processing/examples/punctuation_restoration/chinese/run.sh
+++ b/text_processing/examples/punctuation_restoration/chinese/run.sh
--- a/text_processing/examples/punctuation_restoration/english/README.md
+++ b/text_processing/examples/punctuation_restoration/english/README.md
--- a/text_processing/examples/punctuation_restoration/english/conf/data_conf/english.yaml
+++ b/text_processing/examples/punctuation_restoration/english/conf/data_conf/english.yaml
--- a/text_processing/examples/punctuation_restoration/english/conf/train_conf/bertBLSTM_base_en.yaml
+++ b/text_processing/examples/punctuation_restoration/english/conf/train_conf/bertBLSTM_base_en.yaml
--- a/text_processing/examples/punctuation_restoration/english/conf/train_conf/bertLinear_en.yaml
+++ b/text_processing/examples/punctuation_restoration/english/conf/train_conf/bertLinear_en.yaml
--- a/text_processing/examples/punctuation_restoration/english/local/avg.sh
+++ b/text_processing/examples/punctuation_restoration/english/local/avg.sh
--- a/text_processing/examples/punctuation_restoration/english/local/data.sh
+++ b/text_processing/examples/punctuation_restoration/english/local/data.sh
--- a/text_processing/examples/punctuation_restoration/english/local/test.sh
+++ b/text_processing/examples/punctuation_restoration/english/local/test.sh
--- a/text_processing/examples/punctuation_restoration/english/local/train.sh
+++ b/text_processing/examples/punctuation_restoration/english/local/train.sh
--- a/text_processing/examples/punctuation_restoration/english/path.sh
+++ b/text_processing/examples/punctuation_restoration/english/path.sh
--- a/text_processing/examples/punctuation_restoration/english/run.sh
+++ b/text_processing/examples/punctuation_restoration/english/run.sh
--- a/text_processing/requirements.txt
+++ b/text_processing/requirements.txt
--- a/text_processing/speechtask/punctuation_restoration/bin/avg_model.py
+++ b/text_processing/speechtask/punctuation_restoration/bin/avg_model.py
--- a/text_processing/speechtask/punctuation_restoration/bin/pre_data.py
+++ b/text_processing/speechtask/punctuation_restoration/bin/pre_data.py
--- a/text_processing/speechtask/punctuation_restoration/bin/test.py
+++ b/text_processing/speechtask/punctuation_restoration/bin/test.py
--- a/text_processing/speechtask/punctuation_restoration/bin/train.py
+++ b/text_processing/speechtask/punctuation_restoration/bin/train.py
--- a/text_processing/speechtask/punctuation_restoration/modules/__init__.py
+++ b/text_processing/speechtask/punctuation_restoration/modules/__init__.py
--- a/text_processing/speechtask/punctuation_restoration/io/collator.py
+++ b/text_processing/speechtask/punctuation_restoration/io/collator.py
--- a/text_processing/speechtask/punctuation_restoration/io/common.py
+++ b/text_processing/speechtask/punctuation_restoration/io/common.py
--- a/text_processing/speechtask/punctuation_restoration/io/dataset.py
+++ b/text_processing/speechtask/punctuation_restoration/io/dataset.py
--- a/text_processing/speechtask/punctuation_restoration/model/BertBLSTM.py
+++ b/text_processing/speechtask/punctuation_restoration/model/BertBLSTM.py
--- a/text_processing/speechtask/punctuation_restoration/model/BertLinear.py
+++ b/text_processing/speechtask/punctuation_restoration/model/BertLinear.py
--- a/text_processing/speechtask/punctuation_restoration/model/blstm.py
+++ b/text_processing/speechtask/punctuation_restoration/model/blstm.py
--- a/text_processing/speechtask/punctuation_restoration/model/lstm.py
+++ b/text_processing/speechtask/punctuation_restoration/model/lstm.py
--- a/text_processing/speechtask/punctuation_restoration/training/__init__.py
+++ b/text_processing/speechtask/punctuation_restoration/training/__init__.py
--- a/text_processing/speechtask/punctuation_restoration/modules/activation.py
+++ b/text_processing/speechtask/punctuation_restoration/modules/activation.py
--- a/text_processing/speechtask/punctuation_restoration/modules/attention.py
+++ b/text_processing/speechtask/punctuation_restoration/modules/attention.py
--- a/text_processing/speechtask/punctuation_restoration/modules/crf.py
+++ b/text_processing/speechtask/punctuation_restoration/modules/crf.py
--- a/text_processing/speechtask/punctuation_restoration/utils/__init__.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/__init__.py
--- a/text_processing/speechtask/punctuation_restoration/training/loss.py
+++ b/text_processing/speechtask/punctuation_restoration/training/loss.py
--- a/text_processing/speechtask/punctuation_restoration/training/trainer.py
+++ b/text_processing/speechtask/punctuation_restoration/training/trainer.py
--- a/paddlespeech/text/speechtask/punctuation_restoration/utils/__init__.py
+++ b/paddlespeech/text/speechtask/punctuation_restoration/utils/__init__.py
--- a/text_processing/speechtask/punctuation_restoration/utils/checkpoint.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/checkpoint.py
--- a/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py
+++ b/paddlespeech/text/speechtask/punctuation_restoration/utils/default_parser.py
--- a/text_processing/speechtask/punctuation_restoration/utils/layer_tools.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/layer_tools.py
--- a/text_processing/speechtask/punctuation_restoration/utils/mp_tools.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/mp_tools.py
--- a/text_processing/speechtask/punctuation_restoration/utils/punct_pre.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/punct_pre.py
--- a/text_processing/speechtask/punctuation_restoration/utils/utility.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/utility.py
--- a/parakeet/data/__init__.py
+++ b/parakeet/data/__init__.py
--- a/parakeet/datasets/am_batch_fn.py
+++ b/parakeet/datasets/am_batch_fn.py
--- a/parakeet/exps/fastspeech2/inference.py
+++ b/parakeet/exps/fastspeech2/inference.py
--- a/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py
+++ b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e.py
--- a/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
+++ b/parakeet/exps/fastspeech2/multi_spk_synthesize_e2e_en.py
--- a/parakeet/exps/fastspeech2/normalize.py
+++ b/parakeet/exps/fastspeech2/normalize.py
--- a/parakeet/exps/fastspeech2/preprocess.py
+++ b/parakeet/exps/fastspeech2/preprocess.py
--- a/parakeet/exps/fastspeech2/synthesize.py
+++ b/parakeet/exps/fastspeech2/synthesize.py
--- a/parakeet/exps/fastspeech2/synthesize_e2e.py
+++ b/parakeet/exps/fastspeech2/synthesize_e2e.py
--- a/parakeet/exps/fastspeech2/synthesize_e2e_en.py
+++ b/parakeet/exps/fastspeech2/synthesize_e2e_en.py
--- a/parakeet/exps/fastspeech2/synthesize_e2e_melgan.py
+++ b/parakeet/exps/fastspeech2/synthesize_e2e_melgan.py
--- a/parakeet/exps/fastspeech2/train.py
+++ b/parakeet/exps/fastspeech2/train.py
--- a/parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py
+++ b/parakeet/exps/gan_vocoder/multi_band_melgan/synthesize.py
--- a/parakeet/exps/gan_vocoder/multi_band_melgan/train.py
+++ b/parakeet/exps/gan_vocoder/multi_band_melgan/train.py
--- a/parakeet/exps/gan_vocoder/normalize.py
+++ b/parakeet/exps/gan_vocoder/normalize.py
--- a/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py
+++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize.py
--- a/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
+++ b/parakeet/exps/gan_vocoder/parallelwave_gan/synthesize_from_wav.py
--- a/parakeet/exps/gan_vocoder/parallelwave_gan/train.py
+++ b/parakeet/exps/gan_vocoder/parallelwave_gan/train.py
--- a/parakeet/exps/gan_vocoder/preprocess.py
+++ b/parakeet/exps/gan_vocoder/preprocess.py
--- a/parakeet/exps/ge2e/dataset_processors.py
+++ b/parakeet/exps/ge2e/dataset_processors.py
--- a/parakeet/exps/ge2e/inference.py
+++ b/parakeet/exps/ge2e/inference.py
--- a/parakeet/exps/ge2e/preprocess.py
+++ b/parakeet/exps/ge2e/preprocess.py
--- a/parakeet/exps/ge2e/speaker_verification_dataset.py
+++ b/parakeet/exps/ge2e/speaker_verification_dataset.py
--- a/parakeet/exps/ge2e/train.py
+++ b/parakeet/exps/ge2e/train.py
--- a/parakeet/exps/speedyspeech/inference.py
+++ b/parakeet/exps/speedyspeech/inference.py
--- a/parakeet/exps/speedyspeech/normalize.py
+++ b/parakeet/exps/speedyspeech/normalize.py
--- a/parakeet/exps/speedyspeech/preprocess.py
+++ b/parakeet/exps/speedyspeech/preprocess.py
--- a/parakeet/exps/speedyspeech/synthesize.py
+++ b/parakeet/exps/speedyspeech/synthesize.py
--- a/parakeet/exps/speedyspeech/synthesize_e2e.py
+++ b/parakeet/exps/speedyspeech/synthesize_e2e.py
--- a/parakeet/exps/speedyspeech/train.py
+++ b/parakeet/exps/speedyspeech/train.py
--- a/parakeet/exps/tacotron2/ljspeech.py
+++ b/parakeet/exps/tacotron2/ljspeech.py
--- a/parakeet/exps/tacotron2/preprocess.py
+++ b/parakeet/exps/tacotron2/preprocess.py
--- a/parakeet/exps/tacotron2/synthesize.ipynb
+++ b/parakeet/exps/tacotron2/synthesize.ipynb
--- a/parakeet/exps/tacotron2/synthesize.py
+++ b/parakeet/exps/tacotron2/synthesize.py
--- a/parakeet/exps/tacotron2/train.py
+++ b/parakeet/exps/tacotron2/train.py
--- a/parakeet/exps/transformer_tts/normalize.py
+++ b/parakeet/exps/transformer_tts/normalize.py
--- a/parakeet/exps/transformer_tts/preprocess.py
+++ b/parakeet/exps/transformer_tts/preprocess.py
--- a/parakeet/exps/transformer_tts/synthesize.py
+++ b/parakeet/exps/transformer_tts/synthesize.py
--- a/parakeet/exps/transformer_tts/synthesize_e2e.py
+++ b/parakeet/exps/transformer_tts/synthesize_e2e.py
--- a/parakeet/exps/transformer_tts/train.py
+++ b/parakeet/exps/transformer_tts/train.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/aishell3.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/aishell3.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/chinese_g2p.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/extract_mel.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/train.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/train.py
--- a/parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
+++ b/parakeet/exps/voice_cloning/tacotron2_ge2e/voice_cloning.py
--- a/parakeet/exps/waveflow/ljspeech.py
+++ b/parakeet/exps/waveflow/ljspeech.py
--- a/parakeet/exps/waveflow/preprocess.py
+++ b/parakeet/exps/waveflow/preprocess.py
--- a/parakeet/exps/waveflow/synthesize.py
+++ b/parakeet/exps/waveflow/synthesize.py
--- a/parakeet/exps/waveflow/train.py
+++ b/parakeet/exps/waveflow/train.py
--- a/parakeet/frontend/arpabet.py
+++ b/parakeet/frontend/arpabet.py
--- a/parakeet/frontend/normalizer/__init__.py
+++ b/parakeet/frontend/normalizer/__init__.py
--- a/parakeet/frontend/normalizer/normalizer.py
+++ b/parakeet/frontend/normalizer/normalizer.py
--- a/parakeet/frontend/phonectic.py
+++ b/parakeet/frontend/phonectic.py
--- a/parakeet/frontend/pinyin.py
+++ b/parakeet/frontend/pinyin.py
--- a/parakeet/frontend/zh_frontend.py
+++ b/parakeet/frontend/zh_frontend.py
--- a/parakeet/frontend/zh_normalization/__init__.py
+++ b/parakeet/frontend/zh_normalization/__init__.py
--- a/parakeet/models/fastspeech2/fastspeech2.py
+++ b/parakeet/models/fastspeech2/fastspeech2.py
--- a/parakeet/models/fastspeech2/fastspeech2_updater.py
+++ b/parakeet/models/fastspeech2/fastspeech2_updater.py
--- a/parakeet/models/melgan/melgan.py
+++ b/parakeet/models/melgan/melgan.py
--- a/parakeet/models/melgan/multi_band_melgan_updater.py
+++ b/parakeet/models/melgan/multi_band_melgan_updater.py
--- a/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py
+++ b/parakeet/models/parallel_wavegan/parallel_wavegan_updater.py
--- a/parakeet/models/speedyspeech/speedyspeech.py
+++ b/parakeet/models/speedyspeech/speedyspeech.py
--- a/parakeet/models/speedyspeech/speedyspeech_updater.py
+++ b/parakeet/models/speedyspeech/speedyspeech_updater.py
--- a/parakeet/models/tacotron2.py
+++ b/parakeet/models/tacotron2.py
--- a/parakeet/models/transformer_tts/transformer_tts.py
+++ b/parakeet/models/transformer_tts/transformer_tts.py
--- a/parakeet/models/transformer_tts/transformer_tts_updater.py
+++ b/parakeet/models/transformer_tts/transformer_tts_updater.py
--- a/parakeet/models/waveflow.py
+++ b/parakeet/models/waveflow.py
--- a/parakeet/modules/fastspeech2_predictor/duration_predictor.py
+++ b/parakeet/modules/fastspeech2_predictor/duration_predictor.py
--- a/parakeet/modules/fastspeech2_predictor/variance_predictor.py
+++ b/parakeet/modules/fastspeech2_predictor/variance_predictor.py
--- a/parakeet/modules/fastspeech2_transformer/attention.py
+++ b/parakeet/modules/fastspeech2_transformer/attention.py
--- a/parakeet/modules/fastspeech2_transformer/decoder.py
+++ b/parakeet/modules/fastspeech2_transformer/decoder.py
--- a/parakeet/modules/fastspeech2_transformer/decoder_layer.py
+++ b/parakeet/modules/fastspeech2_transformer/decoder_layer.py
--- a/parakeet/modules/fastspeech2_transformer/encoder.py
+++ b/parakeet/modules/fastspeech2_transformer/encoder.py
--- a/parakeet/modules/fastspeech2_transformer/lightconv.py
+++ b/parakeet/modules/fastspeech2_transformer/lightconv.py
--- a/parakeet/modules/residual_stack.py
+++ b/parakeet/modules/residual_stack.py
--- a/parakeet/modules/style_encoder.py
+++ b/parakeet/modules/style_encoder.py
--- a/parakeet/modules/transformer.py
+++ b/parakeet/modules/transformer.py
--- a/parakeet/training/cli.py
+++ b/parakeet/training/cli.py
--- a/parakeet/training/experiment.py
+++ b/parakeet/training/experiment.py
--- a/parakeet/training/extensions/evaluator.py
+++ b/parakeet/training/extensions/evaluator.py
--- a/parakeet/training/extensions/snapshot.py
+++ b/parakeet/training/extensions/snapshot.py
--- a/parakeet/training/extensions/visualizer.py
+++ b/parakeet/training/extensions/visualizer.py
--- a/parakeet/training/trainer.py
+++ b/parakeet/training/trainer.py
--- a/parakeet/training/trigger.py
+++ b/parakeet/training/trigger.py
--- a/parakeet/training/updaters/standard_updater.py
+++ b/parakeet/training/updaters/standard_updater.py
--- a/parakeet/utils/checkpoint.py
+++ b/parakeet/utils/checkpoint.py
--- a/setup.py
+++ b/setup.py
--- a/setup.sh
+++ b/setup.sh
--- a/env.sh
+++ b/env.sh
--- a/tests/benchmark/pwgan/README.md
+++ b/tests/benchmark/pwgan/README.md
--- a/tests/benchmark/pwgan/run_all.sh
+++ b/tests/benchmark/pwgan/run_all.sh
--- a/tests/benchmark/pwgan/run_benchmark.sh
+++ b/tests/benchmark/pwgan/run_benchmark.sh
--- a/tests/chains/ds2/ds2_params_lite_train_infer.txt
+++ b/tests/chains/ds2/ds2_params_lite_train_infer.txt
--- a/tests/chains/ds2/ds2_params_whole_train_infer.txt
+++ b/tests/chains/ds2/ds2_params_whole_train_infer.txt
--- a/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_lite_multi_gpu.txt
--- a/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_lite_single_gpu.txt
--- a/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_whole_multi_gpu.txt
--- a/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt
+++ b/tests/chains/speedyspeech/speedyspeech_params_whole_single_gpu.txt
--- a/tests/unit/asr/deepspeech2_model_test.py
+++ b/tests/unit/asr/deepspeech2_model_test.py
--- a/tests/unit/asr/deepspeech2_online_model_test.py
+++ b/tests/unit/asr/deepspeech2_online_model_test.py
--- a/tests/unit/asr/error_rate_test.py
+++ b/tests/unit/asr/error_rate_test.py
--- a/tests/unit/asr/mask_test.py
+++ b/tests/unit/asr/mask_test.py
--- a/tests/unit/asr/u2_model_test.py
+++ b/tests/unit/asr/u2_model_test.py
--- a/tests/unit/tts/test_data_table.py
+++ b/tests/unit/tts/test_data_table.py
--- a/tests/unit/tts/test_expansion.py
+++ b/tests/unit/tts/test_expansion.py
--- a/tests/unit/tts/test_pwg.py
+++ b/tests/unit/tts/test_pwg.py
--- a/tests/unit/tts/test_reporter.py
+++ b/tests/unit/tts/test_reporter.py
--- a/tests/unit/tts/test_snapshot.py
+++ b/tests/unit/tts/test_snapshot.py
--- a/tests/unit/tts/test_stft.py
+++ b/tests/unit/tts/test_stft.py
--- a/text_processing/speechtask/punctuation_restoration/utils/default_parser.py
+++ b/text_processing/speechtask/punctuation_restoration/utils/default_parser.py
--- a/third_party/README.md
+++ b/third_party/README.md
--- a/third_party/chinese_text_normalization/.gitignore
+++ b/third_party/chinese_text_normalization/.gitignore
--- a/third_party/chinese_text_normalization/LICENSE
+++ b/third_party/chinese_text_normalization/LICENSE
--- a/third_party/chinese_text_normalization/README.md
+++ b/third_party/chinese_text_normalization/README.md
--- a/third_party/chinese_text_normalization/python/cn_tn.py
+++ b/third_party/chinese_text_normalization/python/cn_tn.py
--- a/third_party/chinese_text_normalization/python/example_kaldi.txt
+++ b/third_party/chinese_text_normalization/python/example_kaldi.txt
--- a/third_party/chinese_text_normalization/python/example_plain.txt
+++ b/third_party/chinese_text_normalization/python/example_plain.txt
--- a/third_party/chinese_text_normalization/python/run.sh
+++ b/third_party/chinese_text_normalization/python/run.sh
--- a/third_party/chinese_text_normalization/thrax/INSTALL.txt
+++ b/third_party/chinese_text_normalization/thrax/INSTALL.txt
--- a/third_party/chinese_text_normalization/thrax/install_thrax.sh
+++ b/third_party/chinese_text_normalization/thrax/install_thrax.sh
--- a/third_party/chinese_text_normalization/thrax/papers/gorman-sproat-2016.pdf
+++ b/third_party/chinese_text_normalization/thrax/papers/gorman-sproat-2016.pdf
--- a/third_party/chinese_text_normalization/thrax/papers/wu-etal-2016.pdf
+++ b/third_party/chinese_text_normalization/thrax/papers/wu-etal-2016.pdf
--- a/third_party/chinese_text_normalization/thrax/run_cn.sh
+++ b/third_party/chinese_text_normalization/thrax/run_cn.sh
--- a/third_party/chinese_text_normalization/thrax/run_en.sh
+++ b/third_party/chinese_text_normalization/thrax/run_en.sh
--- a/third_party/chinese_text_normalization/thrax/src/LICENSE
+++ b/third_party/chinese_text_normalization/thrax/src/LICENSE
--- a/third_party/chinese_text_normalization/thrax/src/Makefile
+++ b/third_party/chinese_text_normalization/thrax/src/Makefile
--- a/third_party/chinese_text_normalization/thrax/src/README.md
+++ b/third_party/chinese_text_normalization/thrax/src/README.md
--- a/third_party/chinese_text_normalization/thrax/src/cn/Makefile
+++ b/third_party/chinese_text_normalization/thrax/src/cn/Makefile
--- a/third_party/chinese_text_normalization/thrax/src/cn/amount.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/amount.grm
--- a/third_party/chinese_text_normalization/thrax/src/cn/byte.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/byte.grm
--- a/third_party/chinese_text_normalization/thrax/src/cn/date.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/date.grm
--- a/third_party/chinese_text_normalization/thrax/src/cn/hotfix.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/hotfix.grm
--- a/third_party/chinese_text_normalization/thrax/src/cn/hotfix.list
+++ b/third_party/chinese_text_normalization/thrax/src/cn/hotfix.list
--- a/third_party/chinese_text_normalization/thrax/src/cn/itn.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/itn.grm
--- a/third_party/chinese_text_normalization/thrax/src/cn/number.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/number.grm
--- a/third_party/chinese_text_normalization/thrax/src/cn/percentage.grm
+++ b/third_party/chinese_text_normalization/thrax/src/cn/percentage.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/README.md
+++ b/third_party/chinese_text_normalization/thrax/src/en/README.md
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/Makefile
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/Makefile
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/cardinals.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/cardinals.tsv
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/extra_numbers.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/extra_numbers.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/factorization.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/factorization.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/float.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/float.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/g.fst
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/g.fst
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/lexical_map.tsv
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/math.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/math.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/miscellaneous.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/miscellaneous.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/money.tsv
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/number_names.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/number_names.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers_plus.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/numbers_plus.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/ordinals.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/ordinals.tsv
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/params.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/params.tsv
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/podspeech.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/podspeech.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spelled.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spelled.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spoken_punct.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/spoken_punct.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/time.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/time.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/urls.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/urls.grm
--- a/third_party/chinese_text_normalization/thrax/src/en/verbalizer/verbalizer.grm
+++ b/third_party/chinese_text_normalization/thrax/src/en/verbalizer/verbalizer.grm
--- a/third_party/chinese_text_normalization/thrax/src/number_data/README.md
+++ b/third_party/chinese_text_normalization/thrax/src/number_data/README.md
--- a/third_party/chinese_text_normalization/thrax/src/number_data/minimal.txt
+++ b/third_party/chinese_text_normalization/thrax/src/number_data/minimal.txt
--- a/third_party/chinese_text_normalization/thrax/src/number_data/random-trn.txt
+++ b/third_party/chinese_text_normalization/thrax/src/number_data/random-trn.txt
--- a/third_party/chinese_text_normalization/thrax/src/number_data/random-tst.txt
+++ b/third_party/chinese_text_normalization/thrax/src/number_data/random-tst.txt
--- a/third_party/chinese_text_normalization/thrax/src/ru/README.md
+++ b/third_party/chinese_text_normalization/thrax/src/ru/README.md
--- a/third_party/chinese_text_normalization/thrax/src/ru/classifier/cyrillic.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/classifier/cyrillic.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals-lex.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals-lex.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/cardinals.tsv
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/extra_numbers.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/extra_numbers.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/factorization.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/factorization.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/float.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/float.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/g.fst
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/g.fst
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/lexical_map.tsv
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/math.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/math.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/miscellaneous.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/miscellaneous.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/money.tsv
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/nominatives.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/nominatives.tsv
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/number_names.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/number_names.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers_plus.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/numbers_plus.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinal_endings.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinal_endings.tsv
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals-lex.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals-lex.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/ordinals.tsv
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spelled.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spelled.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spoken_punct.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/spoken_punct.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/time.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/time.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/urls.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/urls.grm
--- a/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/verbalizer.grm
+++ b/third_party/chinese_text_normalization/thrax/src/ru/verbalizer/verbalizer.grm
--- a/third_party/chinese_text_normalization/thrax/src/universal/README.md
+++ b/third_party/chinese_text_normalization/thrax/src/universal/README.md
--- a/third_party/chinese_text_normalization/thrax/src/universal/roman_numerals.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/universal/roman_numerals.tsv
--- a/third_party/chinese_text_normalization/thrax/src/universal/thousands_punct.grm
+++ b/third_party/chinese_text_normalization/thrax/src/universal/thousands_punct.grm
--- a/third_party/chinese_text_normalization/thrax/src/util/README.md
+++ b/third_party/chinese_text_normalization/thrax/src/util/README.md
--- a/third_party/chinese_text_normalization/thrax/src/util/arithmetic.grm
+++ b/third_party/chinese_text_normalization/thrax/src/util/arithmetic.grm
--- a/third_party/chinese_text_normalization/thrax/src/util/byte.grm
+++ b/third_party/chinese_text_normalization/thrax/src/util/byte.grm
--- a/third_party/chinese_text_normalization/thrax/src/util/case.grm
+++ b/third_party/chinese_text_normalization/thrax/src/util/case.grm
--- a/third_party/chinese_text_normalization/thrax/src/util/germanic.tsv
+++ b/third_party/chinese_text_normalization/thrax/src/util/germanic.tsv
--- a/third_party/chinese_text_normalization/thrax/src/util/util.grm
+++ b/third_party/chinese_text_normalization/thrax/src/util/util.grm
--- a/third_party/chinese_text_normalization/thrax/testcase_cn.txt
+++ b/third_party/chinese_text_normalization/thrax/testcase_cn.txt
--- a/third_party/chinese_text_normalization/thrax/testcase_en.txt
+++ b/third_party/chinese_text_normalization/thrax/testcase_en.txt
--- a/third_party/install.sh
+++ b/third_party/install.sh
--- a/third_party/nnAudio/.gitignore
+++ b/third_party/nnAudio/.gitignore
--- a/third_party/nnAudio/nnAudio/Spectrogram.py
+++ b/third_party/nnAudio/nnAudio/Spectrogram.py
--- a/third_party/nnAudio/nnAudio/__init__.py
+++ b/third_party/nnAudio/nnAudio/__init__.py
--- a/third_party/nnAudio/nnAudio/librosa_functions.py
+++ b/third_party/nnAudio/nnAudio/librosa_functions.py
--- a/third_party/nnAudio/nnAudio/utils.py
+++ b/third_party/nnAudio/nnAudio/utils.py
--- a/third_party/nnAudio/setup.py
+++ b/third_party/nnAudio/setup.py
--- a/third_party/nnAudio/tests/parameters.py
+++ b/third_party/nnAudio/tests/parameters.py
--- a/third_party/nnAudio/tests/test_spectrogram.py
+++ b/third_party/nnAudio/tests/test_spectrogram.py
--- a/third_party/paddle_audio/__init__.py
+++ b/third_party/paddle_audio/__init__.py
--- a/third_party/paddle_audio/frontend/common.py
+++ b/third_party/paddle_audio/frontend/common.py
--- a/third_party/paddle_audio/frontend/english.wav
+++ b/third_party/paddle_audio/frontend/english.wav
--- a/third_party/paddle_audio/frontend/kaldi.py
+++ b/third_party/paddle_audio/frontend/kaldi.py
--- a/third_party/paddle_audio/frontend/kaldi_test.py
+++ b/third_party/paddle_audio/frontend/kaldi_test.py
--- a/third_party/phkit/README.md
+++ b/third_party/phkit/README.md
--- a/third_party/phkit/phkit/__init__.py
+++ b/third_party/phkit/phkit/__init__.py
--- a/third_party/phkit/phkit/chinese/__init__.py
+++ b/third_party/phkit/phkit/chinese/__init__.py
--- a/third_party/phkit/phkit/chinese/convert.py
+++ b/third_party/phkit/phkit/chinese/convert.py
--- a/third_party/phkit/phkit/chinese/hanziconv.py
+++ b/third_party/phkit/phkit/chinese/hanziconv.py
--- a/third_party/phkit/phkit/chinese/number.py
+++ b/third_party/phkit/phkit/chinese/number.py
--- a/third_party/phkit/phkit/chinese/phoneme.py
+++ b/third_party/phkit/phkit/chinese/phoneme.py
--- a/third_party/phkit/phkit/chinese/pinyin.py
+++ b/third_party/phkit/phkit/chinese/pinyin.py
--- a/third_party/phkit/phkit/chinese/sequence.py
+++ b/third_party/phkit/phkit/chinese/sequence.py
--- a/third_party/phkit/phkit/chinese/style.py
+++ b/third_party/phkit/phkit/chinese/style.py
--- a/third_party/phkit/phkit/chinese/symbol.py
+++ b/third_party/phkit/phkit/chinese/symbol.py
--- a/third_party/phkit/phkit/english/LICENSE
+++ b/third_party/phkit/phkit/english/LICENSE
--- a/third_party/phkit/phkit/english/__init__.py
+++ b/third_party/phkit/phkit/english/__init__.py
--- a/third_party/phkit/phkit/english/cleaners.py
+++ b/third_party/phkit/phkit/english/cleaners.py
--- a/third_party/phkit/phkit/english/cmu_dictionary
+++ b/third_party/phkit/phkit/english/cmu_dictionary
--- a/third_party/phkit/phkit/english/cmudict.py
+++ b/third_party/phkit/phkit/english/cmudict.py
--- a/third_party/phkit/phkit/english/numbers.py
+++ b/third_party/phkit/phkit/english/numbers.py
--- a/third_party/phkit/phkit/english/symbols.py
+++ b/third_party/phkit/phkit/english/symbols.py
--- a/third_party/phkit/phkit/pinyinkit/__init__.py
+++ b/third_party/phkit/phkit/pinyinkit/__init__.py
--- a/third_party/phkit/requirements.txt
+++ b/third_party/phkit/requirements.txt
--- a/third_party/phkit/run_local.py
+++ b/third_party/phkit/run_local.py
--- a/third_party/phkit/setup.py
+++ b/third_party/phkit/setup.py
--- a/third_party/phkit/test.py
+++ b/third_party/phkit/test.py
--- a/third_party/pymmseg-cpp/.gitignore
+++ b/third_party/pymmseg-cpp/.gitignore
--- a/third_party/pymmseg-cpp/DESCRIPTION
+++ b/third_party/pymmseg-cpp/DESCRIPTION
--- a/third_party/pymmseg-cpp/MANIFEST.in
+++ b/third_party/pymmseg-cpp/MANIFEST.in
--- a/third_party/pymmseg-cpp/README.md
+++ b/third_party/pymmseg-cpp/README.md
--- a/third_party/pymmseg-cpp/bin/pymmseg
+++ b/third_party/pymmseg-cpp/bin/pymmseg
--- a/third_party/pymmseg-cpp/mmseg/data/chars.dic
+++ b/third_party/pymmseg-cpp/mmseg/data/chars.dic
--- a/third_party/pymmseg-cpp/mmseg/data/words.dic
+++ b/third_party/pymmseg-cpp/mmseg/data/words.dic
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/algor.cpp
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/algor.cpp
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/algor.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/algor.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/chunk.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/chunk.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/dict.cpp
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/dict.cpp
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/dict.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/dict.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/memory.cpp
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/memory.cpp
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/memory.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/memory.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/mmseg.cpp
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/mmseg.cpp
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/rules.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/rules.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/token.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/token.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/utils.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/utils.h
--- a/third_party/pymmseg-cpp/mmseg/mmseg-cpp/word.h
+++ b/third_party/pymmseg-cpp/mmseg/mmseg-cpp/word.h
--- a/third_party/pymmseg-cpp/setup.cfg
+++ b/third_party/pymmseg-cpp/setup.cfg
--- a/third_party/pymmseg-cpp/setup.py
+++ b/third_party/pymmseg-cpp/setup.py
--- a/third_party/pymmseg-cpp/tests/mmseg_test.py
+++ b/third_party/pymmseg-cpp/tests/mmseg_test.py
--- a/third_party/pymmseg-cpp/tests/test.sh
+++ b/third_party/pymmseg-cpp/tests/test.sh
--- a/third_party/python-pinyin/.bumpversion.cfg
+++ b/third_party/python-pinyin/.bumpversion.cfg
--- a/third_party/python-pinyin/.circleci/config.yml
+++ b/third_party/python-pinyin/.circleci/config.yml
--- a/third_party/python-pinyin/.coveragerc
+++ b/third_party/python-pinyin/.coveragerc
--- a/third_party/python-pinyin/.editorconfig
+++ b/third_party/python-pinyin/.editorconfig
--- a/third_party/python-pinyin/.flake8
+++ b/third_party/python-pinyin/.flake8
--- a/third_party/python-pinyin/.github/CONTRIBUTING.md
+++ b/third_party/python-pinyin/.github/CONTRIBUTING.md
--- a/third_party/python-pinyin/.github/ISSUE_TEMPLATE.md
+++ b/third_party/python-pinyin/.github/ISSUE_TEMPLATE.md
--- a/third_party/python-pinyin/.github/PULL_REQUEST_TEMPLATE.md
+++ b/third_party/python-pinyin/.github/PULL_REQUEST_TEMPLATE.md
--- a/third_party/python-pinyin/.github/workflows/ci.yml
+++ b/third_party/python-pinyin/.github/workflows/ci.yml
--- a/third_party/python-pinyin/.github/workflows/codeql-analysis.yml
+++ b/third_party/python-pinyin/.github/workflows/codeql-analysis.yml
--- a/third_party/python-pinyin/.gitignore
+++ b/third_party/python-pinyin/.gitignore
--- a/third_party/python-pinyin/.gitmodules
+++ b/third_party/python-pinyin/.gitmodules
--- a/third_party/python-pinyin/.pre-commit-config.yaml
+++ b/third_party/python-pinyin/.pre-commit-config.yaml
--- a/third_party/python-pinyin/.style.yapf
+++ b/third_party/python-pinyin/.style.yapf
--- a/third_party/python-pinyin/.whitesource
+++ b/third_party/python-pinyin/.whitesource
--- a/third_party/python-pinyin/CHANGELOG.rst
+++ b/third_party/python-pinyin/CHANGELOG.rst
--- a/third_party/python-pinyin/CODE_OF_CONDUCT.md
+++ b/third_party/python-pinyin/CODE_OF_CONDUCT.md
--- a/third_party/python-pinyin/LICENSE.txt
+++ b/third_party/python-pinyin/LICENSE.txt
--- a/third_party/python-pinyin/MANIFEST.in
+++ b/third_party/python-pinyin/MANIFEST.in
--- a/third_party/python-pinyin/Makefile
+++ b/third_party/python-pinyin/Makefile
--- a/third_party/python-pinyin/README.md
+++ b/third_party/python-pinyin/README.md
--- a/third_party/python-pinyin/README.rst
+++ b/third_party/python-pinyin/README.rst
--- a/third_party/python-pinyin/docs/CHANGELOG.rst
+++ b/third_party/python-pinyin/docs/CHANGELOG.rst
--- a/third_party/python-pinyin/docs/Makefile
+++ b/third_party/python-pinyin/docs/Makefile
--- a/third_party/python-pinyin/docs/api.rst
+++ b/third_party/python-pinyin/docs/api.rst
--- a/third_party/python-pinyin/docs/conf.py
+++ b/third_party/python-pinyin/docs/conf.py
--- a/third_party/python-pinyin/docs/contrib.rst
+++ b/third_party/python-pinyin/docs/contrib.rst
--- a/third_party/python-pinyin/docs/develop.rst
+++ b/third_party/python-pinyin/docs/develop.rst
--- a/third_party/python-pinyin/docs/faq.rst
+++ b/third_party/python-pinyin/docs/faq.rst
--- a/third_party/python-pinyin/docs/index.rst
+++ b/third_party/python-pinyin/docs/index.rst
--- a/third_party/python-pinyin/docs/installation.rst
+++ b/third_party/python-pinyin/docs/installation.rst
--- a/third_party/python-pinyin/docs/make.bat
+++ b/third_party/python-pinyin/docs/make.bat
--- a/third_party/python-pinyin/docs/related.rst
+++ b/third_party/python-pinyin/docs/related.rst
--- a/third_party/python-pinyin/docs/usage.rst
+++ b/third_party/python-pinyin/docs/usage.rst
--- a/third_party/python-pinyin/gen_phrases_dict.py
+++ b/third_party/python-pinyin/gen_phrases_dict.py
--- a/third_party/python-pinyin/gen_pinyin_dict.py
+++ b/third_party/python-pinyin/gen_pinyin_dict.py
--- a/third_party/python-pinyin/phrase-pinyin-data/.bumpversion.cfg
+++ b/third_party/python-pinyin/phrase-pinyin-data/.bumpversion.cfg
--- a/third_party/python-pinyin/phrase-pinyin-data/.gitignore
+++ b/third_party/python-pinyin/phrase-pinyin-data/.gitignore
--- a/third_party/python-pinyin/phrase-pinyin-data/.travis.yml
+++ b/third_party/python-pinyin/phrase-pinyin-data/.travis.yml
--- a/third_party/python-pinyin/phrase-pinyin-data/CHANGELOG.md
+++ b/third_party/python-pinyin/phrase-pinyin-data/CHANGELOG.md
--- a/third_party/python-pinyin/phrase-pinyin-data/LICENSE
+++ b/third_party/python-pinyin/phrase-pinyin-data/LICENSE
--- a/third_party/python-pinyin/phrase-pinyin-data/Makefile
+++ b/third_party/python-pinyin/phrase-pinyin-data/Makefile
--- a/third_party/python-pinyin/phrase-pinyin-data/README.md
+++ b/third_party/python-pinyin/phrase-pinyin-data/README.md
--- a/third_party/python-pinyin/phrase-pinyin-data/cc_cedict.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/cc_cedict.txt
--- a/third_party/python-pinyin/phrase-pinyin-data/get_latest_cc_cedict.py
+++ b/third_party/python-pinyin/phrase-pinyin-data/get_latest_cc_cedict.py
--- a/third_party/python-pinyin/phrase-pinyin-data/large_pinyin.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/large_pinyin.txt
--- a/third_party/python-pinyin/phrase-pinyin-data/merge.py
+++ b/third_party/python-pinyin/phrase-pinyin-data/merge.py
--- a/third_party/python-pinyin/phrase-pinyin-data/overwrite.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/overwrite.txt
--- a/third_party/python-pinyin/phrase-pinyin-data/parse_latest_cc_cedict.py
+++ b/third_party/python-pinyin/phrase-pinyin-data/parse_latest_cc_cedict.py
--- a/third_party/python-pinyin/phrase-pinyin-data/pinyin.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/pinyin.txt
--- a/third_party/python-pinyin/phrase-pinyin-data/requirements_dev.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/requirements_dev.txt
--- a/third_party/python-pinyin/phrase-pinyin-data/zdic_cibs.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/zdic_cibs.txt
--- a/third_party/python-pinyin/phrase-pinyin-data/zdic_cybs.txt
+++ b/third_party/python-pinyin/phrase-pinyin-data/zdic_cybs.txt
--- a/third_party/python-pinyin/pinyin-data/.bumpversion.cfg
+++ b/third_party/python-pinyin/pinyin-data/.bumpversion.cfg
--- a/third_party/python-pinyin/pinyin-data/.github/workflows/python-app.yml
+++ b/third_party/python-pinyin/pinyin-data/.github/workflows/python-app.yml
--- a/third_party/python-pinyin/pinyin-data/.gitignore
+++ b/third_party/python-pinyin/pinyin-data/.gitignore
--- a/third_party/python-pinyin/pinyin-data/.travis.yml
+++ b/third_party/python-pinyin/pinyin-data/.travis.yml
--- a/third_party/python-pinyin/pinyin-data/CHANGELOG.md
+++ b/third_party/python-pinyin/pinyin-data/CHANGELOG.md
--- a/third_party/python-pinyin/pinyin-data/GBK_PUA.txt
+++ b/third_party/python-pinyin/pinyin-data/GBK_PUA.txt
--- a/third_party/python-pinyin/pinyin-data/LICENSE
+++ b/third_party/python-pinyin/pinyin-data/LICENSE
--- a/third_party/python-pinyin/pinyin-data/Makefile
+++ b/third_party/python-pinyin/pinyin-data/Makefile
--- a/third_party/python-pinyin/pinyin-data/README.md
+++ b/third_party/python-pinyin/pinyin-data/README.md
--- a/third_party/python-pinyin/pinyin-data/kHanyuPinlu.txt
+++ b/third_party/python-pinyin/pinyin-data/kHanyuPinlu.txt
--- a/third_party/python-pinyin/pinyin-data/kHanyuPinyin.txt
+++ b/third_party/python-pinyin/pinyin-data/kHanyuPinyin.txt
--- a/third_party/python-pinyin/pinyin-data/kMandarin.txt
+++ b/third_party/python-pinyin/pinyin-data/kMandarin.txt
--- a/third_party/python-pinyin/pinyin-data/kMandarin_8105.txt
+++ b/third_party/python-pinyin/pinyin-data/kMandarin_8105.txt
--- a/third_party/python-pinyin/pinyin-data/kMandarin_overwrite.txt
+++ b/third_party/python-pinyin/pinyin-data/kMandarin_overwrite.txt
--- a/third_party/python-pinyin/pinyin-data/kTGHZ2013.txt
+++ b/third_party/python-pinyin/pinyin-data/kTGHZ2013.txt