Merge pull request #754 from PaddlePaddle/develop

release 2.1.1

Merge pull request #754 from PaddlePaddle/develop
release 2.1.1
fa34cdf1 · Hui Zhang · GitHub · 5ef4a34e · 0309c36a · fa34cdf1
305 changed files
--- a/.bashrc
+++ b/.bashrc
+unset GREP_OPTIONS
+
+# https://zhuanlan.zhihu.com/p/33050965
+alias nvs='nvidia-smi'
+alias his='history'
+alias jobs='jobs -l'
+alias ports='netstat -tulanp'
+alias wget='wget -c'
+
+## Colorize the grep command output for ease of use (good for log files)##
+alias grep='grep --color=auto'
+alias egrep='egrep --color=auto'
+alias fgrep='fgrep --color=auto'
+
+
--- a/.gitignore
+++ b/.gitignore
@@ -10,8 +10,13 @@
 .ipynb_checkpoints
 *.npz
 *.done
+*.whl

 tools/venv
 tools/kenlm
 tools/sox-14.4.2
 tools/soxbindings
+tools/montreal-forced-aligner/
+tools/Montreal-Forced-Aligner/
+
+*output/
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -87,3 +87,9 @@ pull_request_rules:
    actions:
      label:
        add: ["Docker"]
+  - name: "auto add label=Deployment"
+    conditions:
+      - files~=^speechnn/
+    actions:
+      label:
+        add: ["Deployment"]
--- a/.notebook/audio_feature.ipynb
+++ b/.notebook/audio_feature.ipynb
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@
 ## Setup

 * python>=3.7
-* paddlepaddle>=2.1.0
+* paddlepaddle>=2.1.2

 Please see [install](doc/src/install.md).


--- a/README_cn.md
+++ b/README_cn.md
@@ -17,7 +17,7 @@
 ## 安装

 * python>=3.7
-* paddlepaddle>=2.1.0
+* paddlepaddle>=2.1.2

 参看 [安装](doc/src/install.md)。


--- a/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/runtime.py
@@ -18,8 +18,10 @@ import numpy as np
 import paddle
 from paddle.inference import Config
 from paddle.inference import create_predictor
+from paddle.io import DataLoader

 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser
@@ -78,26 +80,31 @@ def inference(config, args):
 def start_server(config, args):
    """Start the ASR server"""
    config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
    dataset = ManifestDataset.from_config(config)

-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                             args.checkpoint_path)
    model.eval()

    # prepare ASR inference handler
    def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        audio_len = feature[0].shape[0]
        audio_len = np.array([audio_len]).astype('int64')  # [1]

        result_transcript = model.decode(
            paddle.to_tensor(audio),
            paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
            decoding_method=config.decoding.decoding_method,
            lang_model_path=config.decoding.lang_model_path,
            beam_alpha=config.decoding.alpha,
@@ -138,7 +145,7 @@ if __name__ == "__main__":
    add_arg('host_ip',          str,
            'localhost',
            "Server's IP address.")
-    add_arg('host_port',        int,    8086,    "Server's IP port.")
+    add_arg('host_port',        int,    8089,    "Server's IP port.")
    add_arg('speech_save_dir',  str,
            'demo_cache',
            "Directory to save demo audios.")

--- a/deepspeech/exps/deepspeech2/bin/deploy/server.py
+++ b/deepspeech/exps/deepspeech2/bin/deploy/server.py
@@ -16,8 +16,10 @@ import functools

 import numpy as np
 import paddle
+from paddle.io import DataLoader

 from deepspeech.exps.deepspeech2.config import get_cfg_defaults
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.deepspeech2 import DeepSpeech2Model
 from deepspeech.training.cli import default_argument_parser
@@ -31,26 +33,35 @@ from deepspeech.utils.utility import print_arguments
 def start_server(config, args):
    """Start the ASR server"""
    config.defrost()
-    config.data.manfiest = config.data.test_manifest
-    config.data.augmentation_config = ""
-    config.data.keep_transcription_text = True
+    config.data.manifest = config.data.test_manifest
    dataset = ManifestDataset.from_config(config)

-    model = DeepSpeech2Model.from_pretrained(dataset, config,
+    config.collator.augmentation_config = ""
+    config.collator.keep_transcription_text = True
+    config.collator.batch_size = 1
+    config.collator.num_workers = 0
+    collate_fn = SpeechCollator.from_config(config)
+    test_loader = DataLoader(dataset, collate_fn=collate_fn, num_workers=0)
+
+    model = DeepSpeech2Model.from_pretrained(test_loader, config,
                                             args.checkpoint_path)
    model.eval()

    # prepare ASR inference handler
    def file_to_transcript(filename):
-        feature = dataset.process_utterance(filename, "")
-        audio = np.array([feature[0]]).astype('float32')  #[1, D, T]
-        audio_len = feature[0].shape[1]
+        feature = test_loader.collate_fn.process_utterance(filename, "")
+        audio = np.array([feature[0]]).astype('float32')  #[1, T, D]
+        # audio = audio.swapaxes(1,2)
+        print('---file_to_transcript feature----')
+        print(audio.shape)
+        audio_len = feature[0].shape[0]
+        print(audio_len)
        audio_len = np.array([audio_len]).astype('int64')  # [1]

        result_transcript = model.decode(
            paddle.to_tensor(audio),
            paddle.to_tensor(audio_len),
-            vocab_list=dataset.vocab_list,
+            vocab_list=test_loader.collate_fn.vocab_list,
            decoding_method=config.decoding.decoding_method,
            lang_model_path=config.decoding.lang_model_path,
            beam_alpha=config.decoding.alpha,
@@ -91,7 +102,7 @@ if __name__ == "__main__":
    add_arg('host_ip',          str,
            'localhost',
            "Server's IP address.")
-    add_arg('host_port',        int,    8086,    "Server's IP port.")
+    add_arg('host_port',        int,    8088,    "Server's IP port.")
    add_arg('speech_save_dir',  str,
            'demo_cache',
            "Directory to save demo audios.")

--- a/deepspeech/exps/deepspeech2/bin/export.py
+++ b/deepspeech/exps/deepspeech2/bin/export.py
@@ -30,11 +30,15 @@ def main(config, args):

 if __name__ == "__main__":
    parser = default_argument_parser()
+    parser.add_argument("--model_type")
    args = parser.parse_args()
+    if args.model_type is None:
+        args.model_type = 'offline'
+    print("model_type:{}".format(args.model_type))
    print_arguments(args)

    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = get_cfg_defaults(args.model_type)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:

--- a/deepspeech/exps/deepspeech2/bin/test.py
+++ b/deepspeech/exps/deepspeech2/bin/test.py
@@ -30,11 +30,15 @@ def main(config, args):

 if __name__ == "__main__":
    parser = default_argument_parser()
+    parser.add_argument("--model_type")
    args = parser.parse_args()
    print_arguments(args, globals())
+    if args.model_type is None:
+        args.model_type = 'offline'
+    print("model_type:{}".format(args.model_type))

    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = get_cfg_defaults(args.model_type)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:

--- a/deepspeech/exps/deepspeech2/bin/train.py
+++ b/deepspeech/exps/deepspeech2/bin/train.py
@@ -35,11 +35,15 @@ def main(config, args):

 if __name__ == "__main__":
    parser = default_argument_parser()
+    parser.add_argument("--model_type")
    args = parser.parse_args()
+    if args.model_type is None:
+        args.model_type = 'offline'
+    print("model_type:{}".format(args.model_type))
    print_arguments(args, globals())

    # https://yaml.org/type/float.html
-    config = get_cfg_defaults()
+    config = get_cfg_defaults(args.model_type)
    if args.config:
        config.merge_from_file(args.config)
    if args.opts:

--- a/deepspeech/exps/deepspeech2/bin/tune.py
+++ b/deepspeech/exps/deepspeech2/bin/tune.py
@@ -47,7 +47,7 @@ def tune(config, args):
        drop_last=False,
        collate_fn=SpeechCollator(keep_transcription_text=True))

-    model = DeepSpeech2Model.from_pretrained(dev_dataset, config,
+    model = DeepSpeech2Model.from_pretrained(valid_loader, config,
                                             args.checkpoint_path)
    model.eval()


--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -11,77 +11,26 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from yacs.config import CfgNode as CN
-
-from deepspeech.models.deepspeech2 import DeepSpeech2Model
-
-_C = CN()
-_C.data = CN(
-    dict(
-        train_manifest="",
-        dev_manifest="",
-        test_manifest="",
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
-        max_duration=float('inf'),
-        min_duration=0.0,
-        stride_ms=10.0,  # ms
-        window_ms=20.0,  # ms
-        n_fft=None,  # fft points
-        max_freq=None,  # None for samplerate/2
-        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-        feat_dim=0,  # 'mfcc', 'fbank'
-        delat_delta=False,  # 'mfcc', 'fbank'
-        target_sample_rate=16000,  # target sample rate
-        use_dB_normalization=True,
-        target_dB=-20,
-        random_seed=0,
-        keep_transcription_text=False,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
-    ))
-
-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-    ))
-
-DeepSpeech2Model.params(_C.model)
-
-_C.training = CN(
-    dict(
-        lr=5e-4,  # learning rate
-        lr_decay=1.0,  # learning rate decay
-        weight_decay=1e-6,  # the coeff of weight decay
-        global_grad_clip=5.0,  # the global norm clip
-        n_epoch=50,  # train epochs
-    ))
-
-_C.decoding = CN(
-    dict(
-        alpha=2.5,  # Coef of LM for beam search.
-        beta=0.3,  # Coef of WC for beam search.
-        cutoff_prob=1.0,  # Cutoff probability for pruning.
-        cutoff_top_n=40,  # Cutoff number for pruning.
-        lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
-        decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
-        error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
-        num_proc_bsearch=8,  # # of CPUs for beam search.
-        beam_size=500,  # Beam search width.
-        batch_size=128,  # decoding batch size
-    ))
-
-
-def get_cfg_defaults():
+from yacs.config import CfgNode
+
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Tester
+from deepspeech.exps.deepspeech2.model import DeepSpeech2Trainer
+from deepspeech.io.collator import SpeechCollator
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.models.ds2 import DeepSpeech2Model
+from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
+
+
+def get_cfg_defaults(model_type='offline'):
+    _C = CfgNode()
+    _C.data = ManifestDataset.params()
+    _C.collator = SpeechCollator.params()
+    _C.training = DeepSpeech2Trainer.params()
+    _C.decoding = DeepSpeech2Tester.params()
+    if model_type == 'offline':
+        _C.model = DeepSpeech2Model.params()
+    else:
+        _C.model = DeepSpeech2ModelOnline.params()
    """Get a yacs CfgNode object with default values for my_project."""
    # Return a clone so that the defaults will not be altered
    # This is for the "local variable" use pattern

--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -11,39 +11,61 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Contains DeepSpeech2 model."""
+"""Contains DeepSpeech2 and DeepSpeech2Online model."""
 import time
 from collections import defaultdict
 from pathlib import Path
+from typing import Optional

 import numpy as np
 import paddle
 from paddle import distributed as dist
 from paddle.io import DataLoader
+from yacs.config import CfgNode

 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
-from deepspeech.models.deepspeech2 import DeepSpeech2InferModel
-from deepspeech.models.deepspeech2 import DeepSpeech2Model
+from deepspeech.models.ds2 import DeepSpeech2InferModel
+from deepspeech.models.ds2 import DeepSpeech2Model
+from deepspeech.models.ds2_online import DeepSpeech2InferModelOnline
+from deepspeech.models.ds2_online import DeepSpeech2ModelOnline
 from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
+from deepspeech.utils.log import Autolog
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()


 class DeepSpeech2Trainer(Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # training config
+        default = CfgNode(
+            dict(
+                lr=5e-4,  # learning rate
+                lr_decay=1.0,  # learning rate decay
+                weight_decay=1e-6,  # the coeff of weight decay
+                global_grad_clip=5.0,  # the global norm clip
+                n_epoch=50,  # train epochs
+            ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
    def __init__(self, config, args):
        super().__init__(config, args)

    def train_batch(self, batch_index, batch_data, msg):
        start = time.time()
-        loss = self.model(*batch_data)
+        utt, audio, audio_len, text, text_len = batch_data
+        loss = self.model(audio, audio_len, text, text_len)
        loss.backward()
        layer_tools.print_grads(self.model, print_func=None)
        self.optimizer.step()
@@ -54,7 +76,7 @@ class DeepSpeech2Trainer(Trainer):
            'train_loss': float(loss),
        }
        msg += "train time: {:>.3f}s, ".format(iteration_time)
-        msg += "batch size: {}, ".format(self.config.data.batch_size)
+        msg += "batch size: {}, ".format(self.config.collator.batch_size)
        msg += ', '.join('{}: {:>.6f}'.format(k, v)
                         for k, v in losses_np.items())
        logger.info(msg)
@@ -73,9 +95,10 @@ class DeepSpeech2Trainer(Trainer):
        num_seen_utts = 1
        total_loss = 0.0
        for i, batch in enumerate(self.valid_loader):
-            loss = self.model(*batch)
+            utt, audio, audio_len, text, text_len = batch
+            loss = self.model(audio, audio_len, text, text_len)
            if paddle.isfinite(loss):
-                num_utts = batch[0].shape[0]
+                num_utts = batch[1].shape[0]
                num_seen_utts += num_utts
                total_loss += float(loss) * num_utts
                valid_losses['val_loss'].append(float(loss))
@@ -98,16 +121,18 @@ class DeepSpeech2Trainer(Trainer):
        return total_loss, num_seen_utts

    def setup_model(self):
-        config = self.config
-        model = DeepSpeech2Model(
-            feat_size=self.train_loader.dataset.feature_size,
-            dict_size=self.train_loader.dataset.vocab_size,
-            num_conv_layers=config.model.num_conv_layers,
-            num_rnn_layers=config.model.num_rnn_layers,
-            rnn_size=config.model.rnn_layer_size,
-            use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
-
+        config = self.config.clone()
+        config.defrost()
+        config.model.feat_size = self.train_loader.collate_fn.feature_size
+        config.model.dict_size = self.train_loader.collate_fn.vocab_size
+        config.freeze()
+
+        if self.args.model_type == 'offline':
+            model = DeepSpeech2Model.from_config(config.model)
+        elif self.args.model_type == 'online':
+            model = DeepSpeech2ModelOnline.from_config(config.model)
+        else:
+            raise Exception("wrong model type")
        if self.parallel:
            model = paddle.DataParallel(model)

@@ -135,50 +160,87 @@ class DeepSpeech2Trainer(Trainer):
    def setup_dataloader(self):
        config = self.config.clone()
        config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False

        config.data.manifest = config.data.train_manifest
        train_dataset = ManifestDataset.from_config(config)

        config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
        dev_dataset = ManifestDataset.from_config(config)

+        config.data.manifest = config.data.test_manifest
+        test_dataset = ManifestDataset.from_config(config)
+
        if self.parallel:
            batch_sampler = SortagradDistributedBatchSampler(
                train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                num_replicas=None,
                rank=None,
                shuffle=True,
                drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
        else:
            batch_sampler = SortagradBatchSampler(
                train_dataset,
                shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
+
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
+        config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
+        collate_fn_test = SpeechCollator.from_config(config)

-        collate_fn = SpeechCollator(keep_transcription_text=False)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers)
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers)
        self.valid_loader = DataLoader(
            dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
            shuffle=False,
            drop_last=False,
-            collate_fn=collate_fn)
-        logger.info("Setup train/valid Dataloader!")
+            collate_fn=collate_fn_dev)
+        self.test_loader = DataLoader(
+            test_dataset,
+            batch_size=config.decoding.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=collate_fn_test)
+        logger.info("Setup train/valid/test  Dataloader!")


 class DeepSpeech2Tester(DeepSpeech2Trainer):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        # testing config
+        default = CfgNode(
+            dict(
+                alpha=2.5,  # Coef of LM for beam search.
+                beta=0.3,  # Coef of WC for beam search.
+                cutoff_prob=1.0,  # Cutoff probability for pruning.
+                cutoff_top_n=40,  # Cutoff number for pruning.
+                lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm',  # Filepath for language model.
+                decoding_method='ctc_beam_search',  # Decoding method. Options: ctc_beam_search, ctc_greedy
+                error_rate_type='wer',  # Error rate type for evaluation. Options `wer`, 'cer'
+                num_proc_bsearch=8,  # # of CPUs for beam search.
+                beam_size=500,  # Beam search width.
+                batch_size=128,  # decoding batch size
+            ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
    def __init__(self, config, args):
        super().__init__(config, args)

@@ -191,15 +253,23 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            trans.append(''.join([chr(i) for i in ids]))
        return trans

-    def compute_metrics(self, audio, audio_len, texts, texts_len):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
        cfg = self.config.decoding
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer

-        vocab_list = self.test_loader.dataset.vocab_list
+        vocab_list = self.test_loader.collate_fn.vocab_list

        target_transcripts = self.ordid2token(texts, texts_len)
+        self.autolog.times.start()
+        self.autolog.times.stamp()
        result_transcripts = self.model.decode(
            audio,
            audio_len,
@@ -212,12 +282,18 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            cutoff_prob=cfg.cutoff_prob,
            cutoff_top_n=cfg.cutoff_top_n,
            num_processes=cfg.num_proc_bsearch)
+        self.autolog.times.stamp()
+        self.autolog.times.stamp()
+        self.autolog.times.end()

-        for target, result in zip(target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
+            if fout:
+                fout.write(utt + " " + result + "\n")
            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
                        (target, result))
            logger.info("Current error rate [%s] = %f" %
@@ -234,19 +310,25 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
    @paddle.no_grad()
    def test(self):
        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")
+        self.autolog = Autolog(
+            batch_size=self.config.decoding.batch_size,
+            model_name="deepspeech2",
+            model_precision="fp32").getlog()
        self.model.eval()
        cfg = self.config
        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0
-
-        for i, batch in enumerate(self.test_loader):
-            metrics = self.compute_metrics(*batch)
-            errors_sum += metrics['errors_sum']
-            len_refs += metrics['len_refs']
-            num_ins += metrics['num_ins']
-            error_rate_type = metrics['error_rate_type']
-            logger.info("Error rate [%s] (%d/?) = %f" %
-                        (error_rate_type, num_ins, errors_sum / len_refs))
+        with open(self.args.result_file, 'w') as fout:
+            for i, batch in enumerate(self.test_loader):
+                utts, audio, audio_len, texts, texts_len = batch
+                metrics = self.compute_metrics(utts, audio, audio_len, texts,
+                                               texts_len, fout)
+                errors_sum += metrics['errors_sum']
+                len_refs += metrics['len_refs']
+                num_ins += metrics['num_ins']
+                error_rate_type = metrics['error_rate_type']
+                logger.info("Error rate [%s] (%d/?) = %f" %
+                            (error_rate_type, num_ins, errors_sum / len_refs))

        # logging
        msg = "Test: "
@@ -255,6 +337,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        msg += "Final error rate [%s] (%d/%d) = %f" % (
            error_rate_type, num_ins, num_ins, errors_sum / len_refs)
        logger.info(msg)
+        self.autolog.report()

    def run_test(self):
        self.resume_or_scratch()
@@ -264,19 +347,18 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
            exit(-1)

    def export(self):
-        infer_model = DeepSpeech2InferModel.from_pretrained(
-            self.test_loader.dataset, self.config, self.args.checkpoint_path)
+        if self.args.model_type == 'offline':
+            infer_model = DeepSpeech2InferModel.from_pretrained(
+                self.test_loader, self.config, self.args.checkpoint_path)
+        elif self.args.model_type == 'online':
+            infer_model = DeepSpeech2InferModelOnline.from_pretrained(
+                self.test_loader, self.config, self.args.checkpoint_path)
+        else:
+            raise Exception("wrong model type")
+
        infer_model.eval()
-        feat_dim = self.test_loader.dataset.feature_size
-        static_model = paddle.jit.to_static(
-            infer_model,
-            input_spec=[
-                paddle.static.InputSpec(
-                    shape=[None, None, feat_dim],
-                    dtype='float32'),  # audio, [B,T,D]
-                paddle.static.InputSpec(shape=[None],
-                                        dtype='int64'),  # audio_length, [B]
-            ])
+        feat_dim = self.test_loader.collate_fn.feature_size
+        static_model = infer_model.export()
        logger.info(f"Export code: {static_model.forward.code}")
        paddle.jit.save(static_model, self.args.export_path)

@@ -300,46 +382,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
        self.iteration = 0
        self.epoch = 0

-    def setup_model(self):
-        config = self.config
-        model = DeepSpeech2Model(
-            feat_size=self.test_loader.dataset.feature_size,
-            dict_size=self.test_loader.dataset.vocab_size,
-            num_conv_layers=config.model.num_conv_layers,
-            num_rnn_layers=config.model.num_rnn_layers,
-            rnn_size=config.model.rnn_layer_size,
-            use_gru=config.model.use_gru,
-            share_rnn_weights=config.model.share_rnn_weights)
-        self.model = model
-        logger.info("Setup model!")
-
-    def setup_dataloader(self):
-        config = self.config.clone()
-        config.defrost()
-        # return raw text
-
-        config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
-        # filter test examples, will cause less examples, but no mismatch with training
-        # and can use large batch size , save training time, so filter test egs now.
-        # config.data.min_input_len = 0.0  # second
-        # config.data.max_input_len = float('inf')  # second
-        # config.data.min_output_len = 0.0  # tokens
-        # config.data.max_output_len = float('inf')  # tokens
-        # config.data.min_output_input_ratio = 0.00
-        # config.data.max_output_input_ratio = float('inf')
-        test_dataset = ManifestDataset.from_config(config)
-
-        # return text ord id
-        self.test_loader = DataLoader(
-            test_dataset,
-            batch_size=config.decoding.batch_size,
-            shuffle=False,
-            drop_last=False,
-            collate_fn=SpeechCollator(keep_transcription_text=True))
-        logger.info("Setup test Dataloader!")
-
    def setup_output_dir(self):
        """Create a directory used for output.
        """

--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -15,6 +15,7 @@ from yacs.config import CfgNode

 from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
+from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model

@@ -22,6 +23,8 @@ _C = CfgNode()

 _C.data = ManifestDataset.params()

+_C.collator = SpeechCollator.params()
+
 _C.model = U2Model.params()

 _C.training = U2Trainer.params()

--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -31,12 +31,15 @@ from deepspeech.io.dataset import ManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2 import U2Model
-from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
-from deepspeech.training.scheduler import WarmupLR
+from deepspeech.training.optimizer import OptimizerFactory
+from deepspeech.training.scheduler import LRSchedulerFactory
 from deepspeech.training.trainer import Trainer
+from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
 from deepspeech.utils import layer_tools
 from deepspeech.utils import mp_tools
+from deepspeech.utils import text_grid
+from deepspeech.utils import utility
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()
@@ -76,8 +79,10 @@ class U2Trainer(Trainer):
    def train_batch(self, batch_index, batch_data, msg):
        train_conf = self.config.training
        start = time.time()
+        utt, audio, audio_len, text, text_len = batch_data

-        loss, attention_loss, ctc_loss = self.model(*batch_data)
+        loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+                                                    text_len)
        # loss div by `batch_size * accum_grad`
        loss /= train_conf.accum_grad
        loss.backward()
@@ -99,7 +104,7 @@ class U2Trainer(Trainer):

        if (batch_index + 1) % train_conf.log_interval == 0:
            msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.data.batch_size)
+            msg += "batch size: {}, ".format(self.config.collator.batch_size)
            msg += "accum: {}, ".format(train_conf.accum_grad)
            msg += ', '.join('{}: {:>.6f}'.format(k, v)
                             for k, v in losses_np.items())
@@ -119,9 +124,11 @@ class U2Trainer(Trainer):
        num_seen_utts = 1
        total_loss = 0.0
        for i, batch in enumerate(self.valid_loader):
-            loss, attention_loss, ctc_loss = self.model(*batch)
+            utt, audio, audio_len, text, text_len = batch
+            loss, attention_loss, ctc_loss = self.model(audio, audio_len, text,
+                                                        text_len)
            if paddle.isfinite(loss):
-                num_utts = batch[0].shape[0]
+                num_utts = batch[1].shape[0]
                num_seen_utts += num_utts
                total_loss += float(loss) * num_utts
                valid_losses['val_loss'].append(float(loss))
@@ -209,51 +216,52 @@ class U2Trainer(Trainer):
    def setup_dataloader(self):
        config = self.config.clone()
        config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False

        # train/valid dataset, return token ids
        config.data.manifest = config.data.train_manifest
        train_dataset = ManifestDataset.from_config(config)

        config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
        dev_dataset = ManifestDataset.from_config(config)

-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
        if self.parallel:
            batch_sampler = SortagradDistributedBatchSampler(
                train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                num_replicas=None,
                rank=None,
                shuffle=True,
                drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
        else:
            batch_sampler = SortagradBatchSampler(
                train_dataset,
                shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers, )
        self.valid_loader = DataLoader(
            dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
            shuffle=False,
            drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)

        # test dataset, return raw text
        config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
        # filter test examples, will cause less examples, but no mismatch with training
        # and can use large batch size , save training time, so filter test egs now.
        # config.data.min_input_len = 0.0  # second
@@ -262,22 +270,33 @@ class U2Trainer(Trainer):
        # config.data.max_output_len = float('inf')  # tokens
        # config.data.min_output_input_ratio = 0.00
        # config.data.max_output_input_ratio = float('inf')
+
        test_dataset = ManifestDataset.from_config(config)
        # return text ord id
+        config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
        self.test_loader = DataLoader(
            test_dataset,
            batch_size=config.decoding.batch_size,
            shuffle=False,
            drop_last=False,
-            collate_fn=SpeechCollator(keep_transcription_text=True))
-        logger.info("Setup train/valid/test Dataloader!")
+            collate_fn=SpeechCollator.from_config(config))
+        # return text token id
+        config.collator.keep_transcription_text = False
+        self.align_loader = DataLoader(
+            test_dataset,
+            batch_size=config.decoding.batch_size,
+            shuffle=False,
+            drop_last=False,
+            collate_fn=SpeechCollator.from_config(config))
+        logger.info("Setup train/valid/test/align Dataloader!")

    def setup_model(self):
        config = self.config
        model_conf = config.model
        model_conf.defrost()
-        model_conf.input_dim = self.train_loader.dataset.feature_size
-        model_conf.output_dim = self.train_loader.dataset.vocab_size
+        model_conf.input_dim = self.train_loader.collate_fn.feature_size
+        model_conf.output_dim = self.train_loader.collate_fn.vocab_size
        model_conf.freeze()
        model = U2Model.from_config(model_conf)

@@ -293,30 +312,38 @@ class U2Trainer(Trainer):
        scheduler_type = train_config.scheduler
        scheduler_conf = train_config.scheduler_conf

-        grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
-        weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
-
-        if scheduler_type == 'expdecaylr':
-            lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
-                learning_rate=optim_conf.lr,
-                gamma=scheduler_conf.lr_decay,
-                verbose=False)
-        elif scheduler_type == 'warmuplr':
-            lr_scheduler = WarmupLR(
-                learning_rate=optim_conf.lr,
-                warmup_steps=scheduler_conf.warmup_steps,
-                verbose=False)
-        else:
-            raise ValueError(f"Not support scheduler: {scheduler_type}")
-
-        if optim_type == 'adam':
-            optimizer = paddle.optimizer.Adam(
-                learning_rate=lr_scheduler,
-                parameters=model.parameters(),
-                weight_decay=weight_decay,
-                grad_clip=grad_clip)
-        else:
-            raise ValueError(f"Not support optim: {optim_type}")
+        scheduler_args = {
+            "learning_rate": optim_conf.lr,
+            "verbose": False,
+            "warmup_steps": scheduler_conf.warmup_steps,
+            "gamma": scheduler_conf.lr_decay,
+            "d_model": model_conf.encoder_conf.output_size,
+        }
+        lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
+                                                    scheduler_args)
+
+        def optimizer_args(
+                config,
+                parameters,
+                lr_scheduler=None, ):
+            train_config = config.training
+            optim_type = train_config.optim
+            optim_conf = train_config.optim_conf
+            scheduler_type = train_config.scheduler
+            scheduler_conf = train_config.scheduler_conf
+            return {
+                "grad_clip": train_config.global_grad_clip,
+                "weight_decay": optim_conf.weight_decay,
+                "learning_rate": lr_scheduler
+                if lr_scheduler else optim_conf.lr,
+                "parameters": parameters,
+                "epsilon": 1e-9 if optim_type == 'noam' else None,
+                "beta1": 0.9 if optim_type == 'noam' else None,
+                "beat2": 0.98 if optim_type == 'noam' else None,
+            }
+
+        optimzer_args = optimizer_args(config, model.parameters(), lr_scheduler)
+        optimizer = OptimizerFactory.from_args(optim_type, optimzer_args)

        self.model = model
        self.optimizer = optimizer
@@ -345,7 +372,7 @@ class U2Tester(U2Trainer):
                decoding_chunk_size=-1,  # decoding chunk size. Defaults to -1.
                # <0: for decoding, use full chunk.
                # >0: for decoding, use fixed chunk size as set.
-                # 0: used for training, it's prohibited here. 
+                # 0: used for training, it's prohibited here.
                num_decoding_left_chunks=-1,  # number of left chunks for decoding. Defaults to -1.
                simulate_streaming=False,  # simulate streaming inference. Defaults to False.
            ))
@@ -366,14 +393,20 @@ class U2Tester(U2Trainer):
            trans.append(''.join([chr(i) for i in ids]))
        return trans

-    def compute_metrics(self, audio, audio_len, texts, texts_len, fout=None):
+    def compute_metrics(self,
+                        utts,
+                        audio,
+                        audio_len,
+                        texts,
+                        texts_len,
+                        fout=None):
        cfg = self.config.decoding
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        errors_func = error_rate.char_errors if cfg.error_rate_type == 'cer' else error_rate.word_errors
        error_rate_func = error_rate.cer if cfg.error_rate_type == 'cer' else error_rate.wer

        start_time = time.time()
-        text_feature = self.test_loader.dataset.text_feature
+        text_feature = self.test_loader.collate_fn.text_feature
        target_transcripts = self.ordid2token(texts, texts_len)
        result_transcripts = self.model.decode(
            audio,
@@ -393,13 +426,14 @@ class U2Tester(U2Trainer):
            simulate_streaming=cfg.simulate_streaming)
        decode_time = time.time() - start_time

-        for target, result in zip(target_transcripts, result_transcripts):
+        for utt, target, result in zip(utts, target_transcripts,
+                                       result_transcripts):
            errors, len_ref = errors_func(target, result)
            errors_sum += errors
            len_refs += len_ref
            num_ins += 1
            if fout:
-                fout.write(result + "\n")
+                fout.write(utt + " " + result + "\n")
            logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
                        (target, result))
            logger.info("One example error rate [%s] = %f" %
@@ -421,7 +455,7 @@ class U2Tester(U2Trainer):
        self.model.eval()
        logger.info(f"Test Total Examples: {len(self.test_loader.dataset)}")

-        stride_ms = self.test_loader.dataset.stride_ms
+        stride_ms = self.test_loader.collate_fn.stride_ms
        error_rate_type = None
        errors_sum, len_refs, num_ins = 0.0, 0, 0
        num_frames = 0.0
@@ -483,6 +517,73 @@ class U2Tester(U2Trainer):
        except KeyboardInterrupt:
            sys.exit(-1)

+    @paddle.no_grad()
+    def align(self):
+        if self.config.decoding.batch_size > 1:
+            logger.fatal('alignment mode must be running with batch_size == 1')
+            sys.exit(1)
+
+        # result_file must be given and be named like xxx.align
+        assert self.args.result_file and self.args.result_file.endswith(
+            '.align')
+
+        self.model.eval()
+        logger.info(f"Align Total Examples: {len(self.align_loader.dataset)}")
+
+        stride_ms = self.align_loader.collate_fn.stride_ms
+        token_dict = self.align_loader.collate_fn.vocab_list
+        with open(self.args.result_file, 'w') as fout:
+            # one example in batch
+            for i, batch in enumerate(self.align_loader):
+                key, feat, feats_length, target, target_length = batch
+
+                # 1. Encoder
+                encoder_out, encoder_mask = self.model._forward_encoder(
+                    feat, feats_length)  # (B, maxlen, encoder_dim)
+                maxlen = encoder_out.size(1)
+                ctc_probs = self.model.ctc.log_softmax(
+                    encoder_out)  # (1, maxlen, vocab_size)
+
+                # 2. alignment
+                ctc_probs = ctc_probs.squeeze(0)
+                target = target.squeeze(0)
+                alignment = ctc_utils.forced_align(ctc_probs, target)
+                logger.info("align ids", key[0], alignment)
+                fout.write('{} {}\n'.format(key[0], alignment))
+
+                # 3. gen praat
+                # segment alignment
+                align_segs = text_grid.segment_alignment(alignment)
+                logger.info("align tokens", key[0], align_segs)
+                # IntervalTier, List["start end token\n"]
+                subsample = utility.get_subsample(self.config)
+                tierformat = text_grid.align_to_tierformat(
+                    align_segs, subsample, token_dict)
+                # write tier
+                align_output_path = os.path.join(
+                    os.path.dirname(self.args.result_file), "align")
+                tier_path = os.path.join(align_output_path, key[0] + ".tier")
+                with open(tier_path, 'w') as f:
+                    f.writelines(tierformat)
+                # write textgrid
+                textgrid_path = os.path.join(align_output_path,
+                                             key[0] + ".TextGrid")
+                second_per_frame = 1. / (1000. /
+                                         stride_ms)  # 25ms window, 10ms stride
+                second_per_example = (
+                    len(alignment) + 1) * subsample * second_per_frame
+                text_grid.generate_textgrid(
+                    maxtime=second_per_example,
+                    intervals=tierformat,
+                    output=textgrid_path)
+
+    def run_align(self):
+        self.resume_or_scratch()
+        try:
+            self.align()
+        except KeyboardInterrupt:
+            sys.exit(-1)
+
    def load_inferspec(self):
        """infer model and input spec.

@@ -491,15 +592,14 @@ class U2Tester(U2Trainer):
            List[paddle.static.InputSpec]: input spec.
        """
        from deepspeech.models.u2 import U2InferModel
-        infer_model = U2InferModel.from_pretrained(self.test_loader.dataset,
+        infer_model = U2InferModel.from_pretrained(self.test_loader,
                                                   self.config.model.clone(),
                                                   self.args.checkpoint_path)
-        feat_dim = self.test_loader.dataset.feature_size
+        feat_dim = self.test_loader.collate_fn.feature_size
        input_spec = [
-            paddle.static.InputSpec(
-                shape=[None, feat_dim, None],
-                dtype='float32'),  # audio, [B,D,T]
-            paddle.static.InputSpec(shape=[None],
+            paddle.static.InputSpec(shape=[1, None, feat_dim],
+                                    dtype='float32'),  # audio, [B,T,D]
+            paddle.static.InputSpec(shape=[1],
                                    dtype='int64'),  # audio_length, [B]
        ]
        return infer_model, input_spec

--- a/deepspeech/exps/u2_st/__init__.py
+++ b/deepspeech/exps/u2_st/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
--- a/deepspeech/exps/u2_st/bin/export.py
+++ b/deepspeech/exps/u2_st/bin/export.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Export for U2 ST model."""
+from deepspeech.exps.u2_st.config import get_cfg_defaults
+from deepspeech.exps.u2_st.model import U2STTester as Tester
+from deepspeech.training.cli import default_argument_parser
+from deepspeech.utils.utility import print_arguments
+
+
+def main_sp(config, args):
+    exp = Tester(config, args)
+    exp.setup()
+    exp.run_export()
+
+
+def main(config, args):
+    main_sp(config, args)
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser()
+    args = parser.parse_args()
+    print_arguments(args, globals())
+
+    # https://yaml.org/type/float.html
+    config = get_cfg_defaults()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config)
+    if args.dump_config:
+        with open(args.dump_config, 'w') as f:
+            print(config, file=f)
+
+    main(config, args)
--- a/deepspeech/exps/u2_st/bin/test.py
+++ b/deepspeech/exps/u2_st/bin/test.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Evaluation for U2 ST model."""
+import cProfile
+
+from deepspeech.exps.u2_st.config import get_cfg_defaults
+from deepspeech.exps.u2_st.model import U2STTester as Tester
+from deepspeech.training.cli import default_argument_parser
+from deepspeech.utils.utility import print_arguments
+
+# TODO(hui zhang): dynamic load 
+
+
+def main_sp(config, args):
+    exp = Tester(config, args)
+    exp.setup()
+    exp.run_test()
+
+
+def main(config, args):
+    main_sp(config, args)
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser()
+    args = parser.parse_args()
+    print_arguments(args, globals())
+
+    # https://yaml.org/type/float.html
+    config = get_cfg_defaults()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config)
+    if args.dump_config:
+        with open(args.dump_config, 'w') as f:
+            print(config, file=f)
+
+    # Setting for profiling
+    pr = cProfile.Profile()
+    pr.runcall(main, config, args)
+    pr.dump_stats('test.profile')
--- a/deepspeech/exps/u2_st/bin/train.py
+++ b/deepspeech/exps/u2_st/bin/train.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Trainer for U2 ST model."""
+import cProfile
+import os
+
+from paddle import distributed as dist
+
+from deepspeech.exps.u2_st.config import get_cfg_defaults
+from deepspeech.exps.u2_st.model import U2STTrainer as Trainer
+from deepspeech.training.cli import default_argument_parser
+from deepspeech.utils.utility import print_arguments
+
+
+def main_sp(config, args):
+    exp = Trainer(config, args)
+    exp.setup()
+    exp.run()
+
+
+def main(config, args):
+    if args.device == "gpu" and args.nprocs > 1:
+        dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
+    else:
+        main_sp(config, args)
+
+
+if __name__ == "__main__":
+    parser = default_argument_parser()
+    args = parser.parse_args()
+    print_arguments(args, globals())
+
+    # https://yaml.org/type/float.html
+    config = get_cfg_defaults()
+    if args.config:
+        config.merge_from_file(args.config)
+    if args.opts:
+        config.merge_from_list(args.opts)
+    config.freeze()
+    print(config)
+    if args.dump_config:
+        with open(args.dump_config, 'w') as f:
+            print(config, file=f)
+
+    # Setting for profiling
+    pr = cProfile.Profile()
+    pr.runcall(main, config, args)
+    pr.dump_stats(os.path.join(args.output, 'train.profile'))
--- a/deepspeech/exps/u2_st/config.py
+++ b/deepspeech/exps/u2_st/config.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from yacs.config import CfgNode
+
+from deepspeech.exps.u2_st.model import U2STTester
+from deepspeech.exps.u2_st.model import U2STTrainer
+from deepspeech.io.collator_st import SpeechCollator
+from deepspeech.io.dataset import ManifestDataset
+from deepspeech.models.u2_st import U2STModel
+
+_C = CfgNode()
+
+_C.data = ManifestDataset.params()
+
+_C.collator = SpeechCollator.params()
+
+_C.model = U2STModel.params()
+
+_C.training = U2STTrainer.params()
+
+_C.decoding = U2STTester.params()
+
+
+def get_cfg_defaults():
+    """Get a yacs CfgNode object with default values for the U2 ST experiment."""
+    # Return a clone so that the defaults will not be altered
+    # This is for the "local variable" use pattern
+    config = _C.clone()
+    config.set_new_allowed(True)
+    return config
--- a/deepspeech/exps/u2_st/model.py
+++ b/deepspeech/exps/u2_st/model.py
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@@ -107,7 +107,6 @@ class SpeechFeaturizer(object):
    @property
    def vocab_size(self):
        """Return the vocabulary size.
-
        Returns:
            int: Vocabulary size.
        """
@@ -116,7 +115,6 @@ class SpeechFeaturizer(object):
    @property
    def vocab_list(self):
        """Return the vocabulary in list.
-
        Returns:
            List[str]: 
        """
@@ -125,7 +123,6 @@ class SpeechFeaturizer(object):
    @property
    def vocab_dict(self):
        """Return the vocabulary in dict.
-
        Returns:
            Dict[str, int]: 
        """
@@ -134,7 +131,6 @@ class SpeechFeaturizer(object):
    @property
    def feature_size(self):
        """Return the audio feature size.
-
        Returns:
            int: audio feature size.
        """
@@ -143,7 +139,6 @@ class SpeechFeaturizer(object):
    @property
    def stride_ms(self):
        """time length in `ms` unit per frame
-
        Returns:
            float: time(ms)/frame
        """
@@ -152,7 +147,6 @@ class SpeechFeaturizer(object):
    @property
    def text_feature(self):
        """Return the text feature object.
-
        Returns:
            TextFeaturizer: object.
        """

--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -11,8 +11,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import io
+from collections import namedtuple
+from typing import Optional
+
 import numpy as np
+from yacs.config import CfgNode

+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.io.utility import pad_sequence
 from deepspeech.utils.log import Log
@@ -21,17 +30,221 @@ __all__ = ["SpeechCollator"]

 logger = Log(__name__).getlog()

+# The namedtuple must be defined at module (global) level so it can be pickled.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+

 class SpeechCollator():
-    def __init__(self, keep_transcription_text=True):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        default = CfgNode(
+            dict(
+                augmentation_config="",
+                random_seed=0,
+                mean_std_filepath="",
+                unit_type="char",
+                vocab_filepath="",
+                spm_model_prefix="",
+                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+                feat_dim=0,  # 'mfcc', 'fbank'
+                delta_delta=False,  # 'mfcc', 'fbank'
+                stride_ms=10.0,  # ms
+                window_ms=20.0,  # ms
+                n_fft=None,  # fft points
+                max_freq=None,  # None for samplerate/2
+                target_sample_rate=16000,  # target sample rate
+                use_dB_normalization=True,
+                target_dB=-20,
+                dither=1.0,  # feature dither
+                keep_transcription_text=False))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
+    @classmethod
+    def from_config(cls, config):
+        """Build a SpeechCollator object from a config.
+
+        Args:
+            config (yacs.config.CfgNode): configs object.
+
+        Returns:
+            SpeechCollator: collator object.
        """
-        Padding audio features with zeros to make them have the same shape (or
-        a user-defined shape) within one bach.
+        assert 'augmentation_config' in config.collator
+        assert 'keep_transcription_text' in config.collator
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
+        assert 'specgram_type' in config.collator
+        assert 'n_fft' in config.collator
+        assert config.collator

-        if ``keep_transcription_text`` is False, text is token ids else is raw string.
+        if isinstance(config.collator.augmentation_config, (str, bytes)):
+            if config.collator.augmentation_config:
+                aug_file = io.open(
+                    config.collator.augmentation_config,
+                    mode='r',
+                    encoding='utf8')
+            else:
+                aug_file = io.StringIO(initial_value='{}', newline='')
+        else:
+            aug_file = config.collator.augmentation_config
+            assert isinstance(aug_file, io.StringIO)
+
+        speech_collator = cls(
+            aug_file=aug_file,
+            random_seed=0,
+            mean_std_filepath=config.collator.mean_std_filepath,
+            unit_type=config.collator.unit_type,
+            vocab_filepath=config.collator.vocab_filepath,
+            spm_model_prefix=config.collator.spm_model_prefix,
+            specgram_type=config.collator.specgram_type,
+            feat_dim=config.collator.feat_dim,
+            delta_delta=config.collator.delta_delta,
+            stride_ms=config.collator.stride_ms,
+            window_ms=config.collator.window_ms,
+            n_fft=config.collator.n_fft,
+            max_freq=config.collator.max_freq,
+            target_sample_rate=config.collator.target_sample_rate,
+            use_dB_normalization=config.collator.use_dB_normalization,
+            target_dB=config.collator.target_dB,
+            dither=config.collator.dither,
+            keep_transcription_text=config.collator.keep_transcription_text)
+        return speech_collator
+
+    def __init__(
+            self,
+            aug_file,
+            mean_std_filepath,
+            vocab_filepath,
+            spm_model_prefix,
+            random_seed=0,
+            unit_type="char",
+            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            feat_dim=0,  # 'mfcc', 'fbank'
+            delta_delta=False,  # 'mfcc', 'fbank'
+            stride_ms=10.0,  # ms
+            window_ms=20.0,  # ms
+            n_fft=None,  # fft points
+            max_freq=None,  # None for samplerate/2
+            target_sample_rate=16000,  # target sample rate
+            use_dB_normalization=True,
+            target_dB=-20,
+            dither=1.0,
+            keep_transcription_text=True):
+        """SpeechCollator Collator
+
+        Args:
+            unit_type(str): token unit type, e.g. char, word, spm
+            vocab_filepath (str): vocab file path.
+            mean_std_filepath (str): mean and std file path, which suffix is *.npy
+            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
+            aug_file (file object): opened readable object (e.g. io.StringIO) whose content is the augmentation json str, e.g. '{}'.
+            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
+            window_ms (float, optional): window size in ms. Defaults to 20.0.
+            n_fft (int, optional): fft points for rfft. Defaults to None.
+            max_freq (int, optional): max cut freq. Defaults to None.
+            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
+            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            feat_dim (int, optional): audio feature dim, used by 'mfcc' or 'fbank'. Defaults to 0.
+            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
+            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
+            target_dB (int, optional): target dB. Defaults to -20.
+            random_seed (int, optional): for random generator. Defaults to 0.
+            keep_transcription_text (bool, optional): if True (used when not in training mode), the transcript is not tokenized. Defaults to True.
+                If ``keep_transcription_text`` is False, text is token ids, otherwise it is the raw string.
+
+        Do augmentations
+        Padding audio features with zeros to make them have the same shape (or
+        a user-defined shape) within one batch.
        """
        self._keep_transcription_text = keep_transcription_text

+        self._local_data = TarLocalData(tar2info={}, tar2object={})
+        self._augmentation_pipeline = AugmentationPipeline(
+            augmentation_config=aug_file.read(), random_seed=random_seed)
+
+        self._normalizer = FeatureNormalizer(
+            mean_std_filepath) if mean_std_filepath else None
+
+        self._stride_ms = stride_ms
+        self._target_sample_rate = target_sample_rate
+
+        self._speech_featurizer = SpeechFeaturizer(
+            unit_type=unit_type,
+            vocab_filepath=vocab_filepath,
+            spm_model_prefix=spm_model_prefix,
+            specgram_type=specgram_type,
+            feat_dim=feat_dim,
+            delta_delta=delta_delta,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            n_fft=n_fft,
+            max_freq=max_freq,
+            target_sample_rate=target_sample_rate,
+            use_dB_normalization=use_dB_normalization,
+            target_dB=target_dB,
+            dither=dither)
+
+    def _parse_tar(self, file):
+        """Parse a tar file to get a tarfile object
+        and a map from each member name to its tarinfo.
+        """
+        result = {}
+        f = tarfile.open(file)
+        for tarinfo in f.getmembers():
+            result[tarinfo.name] = tarinfo
+        return f, result
+
+    def _subfile_from_tar(self, file):
+        """Get subfile object from tar.
+
+        It will return a subfile object from tar file
+        and cached tar file info for next reading request.
+        """
+        tarpath, filename = file.split(':', 1)[1].split('#', 1)
+        if 'tar2info' not in self._local_data.__dict__:
+            self._local_data.tar2info = {}
+        if 'tar2object' not in self._local_data.__dict__:
+            self._local_data.tar2object = {}
+        if tarpath not in self._local_data.tar2info:
+            object, infoes = self._parse_tar(tarpath)
+            self._local_data.tar2info[tarpath] = infoes
+            self._local_data.tar2object[tarpath] = object
+        return self._local_data.tar2object[tarpath].extractfile(
+            self._local_data.tar2info[tarpath][filename])
+
+    def process_utterance(self, audio_file, transcript):
+        """Load, augment, featurize and normalize for speech data.
+
+        :param audio_file: Filepath or file object of audio file.
+        :type audio_file: str | file
+        :param transcript: Transcription text.
+        :type transcript: str
+        :return: Tuple of audio feature tensor and data of transcription part,
+                 where transcription part could be token ids or text.
+        :rtype: tuple of (2darray, list)
+        """
+        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
+            speech_segment = SpeechSegment.from_file(
+                self._subfile_from_tar(audio_file), transcript)
+        else:
+            speech_segment = SpeechSegment.from_file(audio_file, transcript)
+
+        # audio augment
+        self._augmentation_pipeline.transform_audio(speech_segment)
+
+        specgram, transcript_part = self._speech_featurizer.featurize(
+            speech_segment, self._keep_transcription_text)
+        if self._normalizer:
+            specgram = self._normalizer.apply(specgram)
+
+        # specgram augment
+        specgram = self._augmentation_pipeline.transform_feature(specgram)
+        specgram = specgram.transpose([1, 0])
+        return specgram, transcript_part
+
    def __call__(self, batch):
        """batch examples

@@ -51,10 +264,14 @@ class SpeechCollator():
        audio_lens = []
        texts = []
        text_lens = []
-        for audio, text in batch:
+        utts = []
+        for utt, audio, text in batch:
+            audio, text = self.process_utterance(audio, text)
+            #utt
+            utts.append(utt)
            # audio
-            audios.append(audio.T)  # [T, D]
-            audio_lens.append(audio.shape[1])
+            audios.append(audio)  # [T, D]
+            audio_lens.append(audio.shape[0])
            # text
            # for training, text is token ids
            # else text is string, convert to unicode ord
@@ -75,4 +292,32 @@ class SpeechCollator():
        padded_texts = pad_sequence(
            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
-        return padded_audios, audio_lens, padded_texts, text_lens
+        return utts, padded_audios, audio_lens, padded_texts, text_lens
+
+    @property
+    def manifest(self):
+        return self._manifest
+
+    @property
+    def vocab_size(self):
+        return self._speech_featurizer.vocab_size
+
+    @property
+    def vocab_list(self):
+        return self._speech_featurizer.vocab_list
+
+    @property
+    def vocab_dict(self):
+        return self._speech_featurizer.vocab_dict
+
+    @property
+    def text_feature(self):
+        return self._speech_featurizer.text_feature
+
+    @property
+    def feature_size(self):
+        return self._speech_featurizer.feature_size
+
+    @property
+    def stride_ms(self):
+        return self._speech_featurizer.stride_ms
--- a/deepspeech/io/collator_st.py
+++ b/deepspeech/io/collator_st.py
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -11,72 +11,31 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import io
-import tarfile
-import time
-from collections import namedtuple
 from typing import Optional

-import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode

-from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
-from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
-from deepspeech.frontend.normalizer import FeatureNormalizer
-from deepspeech.frontend.speech import SpeechSegment
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = [
-    "ManifestDataset",
-]
+__all__ = ["ManifestDataset", "TripletManifestDataset"]

 logger = Log(__name__).getlog()

-# namedtupe need global for pickle.
-TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
-

 class ManifestDataset(Dataset):
    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        default = CfgNode(
            dict(
-                train_manifest="",
-                dev_manifest="",
-                test_manifest="",
                manifest="",
-                unit_type="char",
-                vocab_filepath="",
-                spm_model_prefix="",
-                mean_std_filepath="",
-                augmentation_config="",
                max_input_len=27.0,
                min_input_len=0.0,
                max_output_len=float('inf'),
                min_output_len=0.0,
                max_output_input_ratio=float('inf'),
-                min_output_input_ratio=0.0,
-                stride_ms=10.0,  # ms
-                window_ms=20.0,  # ms
-                n_fft=None,  # fft points
-                max_freq=None,  # None for samplerate/2
-                raw_wav=True,  # use raw_wav or kaldi feature
-                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
-                feat_dim=0,  # 'mfcc', 'fbank'
-                delta_delta=False,  # 'mfcc', 'fbank'
-                dither=1.0,  # feature dither
-                target_sample_rate=16000,  # target sample rate
-                use_dB_normalization=True,
-                target_dB=-20,
-                random_seed=0,
-                keep_transcription_text=False,
-                batch_size=32,  # batch size
-                num_workers=0,  # data loader workers
-                sortagrad=False,  # sorted in first epoch when True
-                shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
-            ))
+                min_output_input_ratio=0.0, ))

        if config is not None:
            config.merge_from_other_cfg(default)
@@ -94,128 +53,38 @@ class ManifestDataset(Dataset):
        """
        assert 'manifest' in config.data
        assert config.data.manifest
-        assert 'keep_transcription_text' in config.data
-
-        if isinstance(config.data.augmentation_config, (str, bytes)):
-            if config.data.augmentation_config:
-                aug_file = io.open(
-                    config.data.augmentation_config, mode='r', encoding='utf8')
-            else:
-                aug_file = io.StringIO(initial_value='{}', newline='')
-        else:
-            aug_file = config.data.augmentation_config
-            assert isinstance(aug_file, io.StringIO)

        dataset = cls(
            manifest_path=config.data.manifest,
-            unit_type=config.data.unit_type,
-            vocab_filepath=config.data.vocab_filepath,
-            mean_std_filepath=config.data.mean_std_filepath,
-            spm_model_prefix=config.data.spm_model_prefix,
-            augmentation_config=aug_file.read(),
            max_input_len=config.data.max_input_len,
            min_input_len=config.data.min_input_len,
            max_output_len=config.data.max_output_len,
            min_output_len=config.data.min_output_len,
            max_output_input_ratio=config.data.max_output_input_ratio,
-            min_output_input_ratio=config.data.min_output_input_ratio,
-            stride_ms=config.data.stride_ms,
-            window_ms=config.data.window_ms,
-            n_fft=config.data.n_fft,
-            max_freq=config.data.max_freq,
-            target_sample_rate=config.data.target_sample_rate,
-            specgram_type=config.data.specgram_type,
-            feat_dim=config.data.feat_dim,
-            delta_delta=config.data.delta_delta,
-            dither=config.data.dither,
-            use_dB_normalization=config.data.use_dB_normalization,
-            target_dB=config.data.target_dB,
-            random_seed=config.data.random_seed,
-            keep_transcription_text=config.data.keep_transcription_text)
+            min_output_input_ratio=config.data.min_output_input_ratio, )
        return dataset

    def __init__(self,
                 manifest_path,
-                 unit_type,
-                 vocab_filepath,
-                 mean_std_filepath,
-                 spm_model_prefix=None,
-                 augmentation_config='{}',
                 max_input_len=float('inf'),
                 min_input_len=0.0,
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
-                 min_output_input_ratio=0.0,
-                 stride_ms=10.0,
-                 window_ms=20.0,
-                 n_fft=None,
-                 max_freq=None,
-                 target_sample_rate=16000,
-                 specgram_type='linear',
-                 feat_dim=None,
-                 delta_delta=False,
-                 dither=1.0,
-                 use_dB_normalization=True,
-                 target_dB=-20,
-                 random_seed=0,
-                 keep_transcription_text=False):
+                 min_output_input_ratio=0.0):
        """Manifest Dataset

        Args:
            manifest_path (str): manifest josn file path
-            unit_type(str): token unit type, e.g. char, word, spm
-            vocab_filepath (str): vocab file path.
-            mean_std_filepath (str): mean and std file path, which suffix is *.npy
-            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
-            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
            max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
            max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
            min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
            min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
-            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
-            window_ms (float, optional): window size in ms. Defaults to 20.0.
-            n_fft (int, optional): fft points for rfft. Defaults to None.
-            max_freq (int, optional): max cut freq. Defaults to None.
-            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
-            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
-            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
-            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
-            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
-            target_dB (int, optional): target dB. Defaults to -20.
-            random_seed (int, optional): for random generator. Defaults to 0.
-            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
+        
        """
        super().__init__()
-        self._stride_ms = stride_ms
-        self._target_sample_rate = target_sample_rate
-
-        self._normalizer = FeatureNormalizer(
-            mean_std_filepath) if mean_std_filepath else None
-        self._augmentation_pipeline = AugmentationPipeline(
-            augmentation_config=augmentation_config, random_seed=random_seed)
-        self._speech_featurizer = SpeechFeaturizer(
-            unit_type=unit_type,
-            vocab_filepath=vocab_filepath,
-            spm_model_prefix=spm_model_prefix,
-            specgram_type=specgram_type,
-            feat_dim=feat_dim,
-            delta_delta=delta_delta,
-            stride_ms=stride_ms,
-            window_ms=window_ms,
-            n_fft=n_fft,
-            max_freq=max_freq,
-            target_sample_rate=target_sample_rate,
-            use_dB_normalization=use_dB_normalization,
-            target_dB=target_dB,
-            dither=dither)
-
-        self._rng = np.random.RandomState(random_seed)
-        self._keep_transcription_text = keep_transcription_text
-        # for caching tar files info
-        self._local_data = TarLocalData(tar2info={}, tar2object={})

        # read manifest
        self._manifest = read_manifest(
@@ -228,123 +97,22 @@ class ManifestDataset(Dataset):
            min_output_input_ratio=min_output_input_ratio)
        self._manifest.sort(key=lambda x: x["feat_shape"][0])

-    @property
-    def manifest(self):
-        return self._manifest
-
-    @property
-    def vocab_size(self):
-        return self._speech_featurizer.vocab_size
-
-    @property
-    def vocab_list(self):
-        return self._speech_featurizer.vocab_list
-
-    @property
-    def vocab_dict(self):
-        return self._speech_featurizer.vocab_dict
-
-    @property
-    def text_feature(self):
-        return self._speech_featurizer.text_feature
-
-    @property
-    def feature_size(self):
-        return self._speech_featurizer.feature_size
-
-    @property
-    def stride_ms(self):
-        return self._speech_featurizer.stride_ms
-
-    def _parse_tar(self, file):
-        """Parse a tar file to get a tarfile object
-        and a map containing tarinfoes
-        """
-        result = {}
-        f = tarfile.open(file)
-        for tarinfo in f.getmembers():
-            result[tarinfo.name] = tarinfo
-        return f, result
-
-    def _subfile_from_tar(self, file):
-        """Get subfile object from tar.
-
-        It will return a subfile object from tar file
-        and cached tar file info for next reading request.
-        """
-        tarpath, filename = file.split(':', 1)[1].split('#', 1)
-        if 'tar2info' not in self._local_data.__dict__:
-            self._local_data.tar2info = {}
-        if 'tar2object' not in self._local_data.__dict__:
-            self._local_data.tar2object = {}
-        if tarpath not in self._local_data.tar2info:
-            object, infoes = self._parse_tar(tarpath)
-            self._local_data.tar2info[tarpath] = infoes
-            self._local_data.tar2object[tarpath] = object
-        return self._local_data.tar2object[tarpath].extractfile(
-            self._local_data.tar2info[tarpath][filename])
-
-    def process_utterance(self, audio_file, transcript):
-        """Load, augment, featurize and normalize for speech data.
-
-        :param audio_file: Filepath or file object of audio file.
-        :type audio_file: str | file
-        :param transcript: Transcription text.
-        :type transcript: str
-        :return: Tuple of audio feature tensor and data of transcription part,
-                 where transcription part could be token ids or text.
-        :rtype: tuple of (2darray, list)
-        """
-        start_time = time.time()
-        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
-            speech_segment = SpeechSegment.from_file(
-                self._subfile_from_tar(audio_file), transcript)
-        else:
-            speech_segment = SpeechSegment.from_file(audio_file, transcript)
-        load_wav_time = time.time() - start_time
-        #logger.debug(f"load wav time: {load_wav_time}")
-
-        # audio augment
-        start_time = time.time()
-        self._augmentation_pipeline.transform_audio(speech_segment)
-        audio_aug_time = time.time() - start_time
-        #logger.debug(f"audio augmentation time: {audio_aug_time}")
-
-        start_time = time.time()
-        specgram, transcript_part = self._speech_featurizer.featurize(
-            speech_segment, self._keep_transcription_text)
-        if self._normalizer:
-            specgram = self._normalizer.apply(specgram)
-        feature_time = time.time() - start_time
-        #logger.debug(f"audio & test feature time: {feature_time}")
-
-        # specgram augment
-        start_time = time.time()
-        specgram = self._augmentation_pipeline.transform_feature(specgram)
-        feature_aug_time = time.time() - start_time
-        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
-        return specgram, transcript_part
-
-    def _instance_reader_creator(self, manifest):
-        """
-        Instance reader creator. Create a callable function to produce
-        instances of data.
-
-        Instance: a tuple of ndarray of audio spectrogram and a list of
-        token indices for transcript.
-        """
+    def __len__(self):
+        return len(self._manifest)

-        def reader():
-            for instance in manifest:
-                inst = self.process_utterance(instance["feat"],
-                                              instance["text"])
-                yield inst
+    def __getitem__(self, idx):
+        instance = self._manifest[idx]
+        return instance["utt"], instance["feat"], instance["text"]

-        return reader

-    def __len__(self):
-        return len(self._manifest)
+class TripletManifestDataset(ManifestDataset):
+    """
+        For Joint Training of Speech Translation and ASR.
+        text: translation,
+        text1: transcript.
+    """

    def __getitem__(self, idx):
        instance = self._manifest[idx]
-        return self.process_utterance(instance["feat"], instance["text"])
+        return instance["utt"], instance["feat"], instance["text"], instance[
+            "text1"]
--- a/deepspeech/io/sampler.py
+++ b/deepspeech/io/sampler.py
@@ -51,7 +51,7 @@ def _batch_shuffle(indices, batch_size, epoch, clipped=False):
    """
    rng = np.random.RandomState(epoch)
    shift_len = rng.randint(0, batch_size - 1)
-    batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
+    batch_indices = list(zip(*[iter(indices[shift_len:])] * batch_size))
    rng.shuffle(batch_indices)
    batch_indices = [item for batch in batch_indices for item in batch]
    assert clipped is False

--- a/deepspeech/models/ds2/__init__.py
+++ b/deepspeech/models/ds2/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .deepspeech2 import DeepSpeech2InferModel
+from .deepspeech2 import DeepSpeech2Model
+
+__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']
--- a/deepspeech/models/ds2/conv.py
+++ b/deepspeech/models/ds2/conv.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle import nn
+from paddle.nn import functional as F
+
+from deepspeech.modules.activation import brelu
+from deepspeech.modules.mask import make_non_pad_mask
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ['ConvStack', "conv_output_size"]
+
+
+def conv_output_size(I, F, P, S):
+    """Return the output size of a convolution along one dimension.
+
+    Args:
+        I: length of the input along that dimension.
+        F: length of the filter (kernel).
+        P: amount of zero padding applied on each side.
+        S: stride.
+
+    Returns:
+        The output length ``O = (I - F + 2 * P) // S + 1``.
+    """
+    # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
+    # Output size after Conv:
+    #       O = (I - F + Pstart + Pend) // S + 1
+    #   When Pstart == Pend == P, Pstart + Pend can be replaced by 2P.
+    #   When Pstart == Pend == 0:
+    #       O = (I - F) // S + 1
+    # https://iq.opengenus.org/output-size-of-convolution/
+    # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1
+    # Output width = (Input width + padding width right + padding width left - kernel width) / (stride width) + 1
+    # The previous expression `(I - F + 2 * P - S) // S` subtracted one stride
+    # instead of adding one, so it returned a value that was 2 too small.
+    return (I - F + 2 * P) // S + 1
+
+
+# receptive field calculator
+# https://fomoro.com/research/article/receptive-field-calculator
+# https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters
+# https://distill.pub/2019/computing-receptive-fields/
+# Rl-1 = Sl * Rl + (Kl - Sl) 
+
+
+class ConvBn(nn.Layer):
+    """Convolution layer with batch normalization.
+
+    The layer applies a 2D convolution (no bias), batch normalization and an
+    activation, then zeroes the positions beyond each sequence's new length.
+
+    :param num_channels_in: Number of input channels.
+    :type num_channels_in: int
+    :param num_channels_out: Number of output channels.
+    :type num_channels_out: int
+    :param kernel_size: Filter size for the two image dimensions. Must have
+                        exactly two elements.
+    :type kernel_size: tuple|list
+    :param stride: Stride for the two image dimensions. Must have exactly two
+                   elements.
+    :type stride: tuple|list
+    :param padding: Padding for the two image dimensions. Must have exactly
+                    two elements.
+    :type padding: tuple|list
+    :param act: Activation type. 'relu' selects relu; any other value selects
+                brelu.
+    :type act: string
+
+    """
+
+    def __init__(self, num_channels_in, num_channels_out, kernel_size, stride,
+                 padding, act):
+
+        super().__init__()
+        # forward() indexes element [1] of each of these, so plain ints are
+        # rejected here.
+        assert len(kernel_size) == 2
+        assert len(stride) == 2
+        assert len(padding) == 2
+        self.kernel_size = kernel_size
+        self.stride = stride
+        self.padding = padding
+
+        self.conv = nn.Conv2D(
+            num_channels_in,
+            num_channels_out,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            weight_attr=None,
+            bias_attr=False,
+            data_format='NCHW')
+
+        self.bn = nn.BatchNorm2D(
+            num_channels_out,
+            weight_attr=None,
+            bias_attr=None,
+            data_format='NCHW')
+        self.act = F.relu if act == 'relu' else brelu
+
+    def forward(self, x, x_len):
+        """Apply conv, batch norm and activation, then mask the padding.
+
+        Args:
+            x (Tensor): audio, shape [B, C, D, T]
+            x_len (Tensor): length of each sequence along the last (T) axis
+                before the convolution.
+
+        Returns:
+            tuple: the masked output tensor and the sequence lengths after
+                the convolution.
+        """
+        x = self.conv(x)
+        x = self.bn(x)
+        x = self.act(x)
+
+        # output length along the second kernel dimension:
+        # (len - kernel + 2 * padding) // stride + 1
+        x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1]
+                 ) // self.stride[1] + 1
+
+        # reset padding part to 0
+        masks = make_non_pad_mask(x_len)  #[B, T]
+        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
+        # TODO(Hui Zhang): not support bool multiply
+        # masks = masks.type_as(x)
+        # cast the mask to the dtype of x so the multiply below is supported
+        masks = masks.astype(x.dtype)
+        x = x.multiply(masks)
+
+        return x, x_len
+
+
+class ConvStack(nn.Layer):
+    """Convolution group with stacked convolution layers.
+
+    The stack is one input ConvBn layer (1 -> 32 channels) followed by
+    ``num_stacks - 1`` further ConvBn layers (32 -> 32 channels). After
+    construction, ``output_height`` holds the number of output channels
+    multiplied by the feature size left after all layers.
+
+    :param feat_size: audio feature dim.
+    :type feat_size: int
+    :param num_stacks: Number of stacked convolution layers.
+    :type num_stacks: int
+    """
+
+    def __init__(self, feat_size, num_stacks):
+        super().__init__()
+        self.feat_size = feat_size  # D
+        self.num_stacks = num_stacks
+
+        # first layer: stride 2 on the feature axis, stride 3 on the time axis
+        self.conv_in = ConvBn(
+            num_channels_in=1,
+            num_channels_out=32,
+            kernel_size=(41, 11),  #[D, T]
+            stride=(2, 3),
+            padding=(20, 5),
+            act='brelu')
+
+        out_channel = 32
+        # remaining layers: stride 2 on the feature axis, stride 1 on time
+        convs = [
+            ConvBn(
+                num_channels_in=32,
+                num_channels_out=out_channel,
+                kernel_size=(21, 11),
+                stride=(2, 1),
+                padding=(10, 5),
+                act='brelu') for i in range(num_stacks - 1)
+        ]
+        self.conv_stack = nn.LayerList(convs)
+
+        # conv output feat_dim
+        # With kernel 41 / padding 20 and kernel 21 / padding 10, kernel minus
+        # twice the padding is 1, so each stride-2 layer maps a feature size
+        # h to (h - 1) // 2 + 1.
+        output_height = (feat_size - 1) // 2 + 1
+        for i in range(self.num_stacks - 1):
+            output_height = (output_height - 1) // 2 + 1
+        self.output_height = out_channel * output_height
+
+    def forward(self, x, x_len):
+        """Run the input through every convolution layer in order.
+
+        Args:
+            x: shape [B, C, D, T]
+            x_len: shape [B]
+
+        Returns:
+            tuple: the output tensor and the sequence lengths, both as
+                returned by the last ConvBn layer.
+        """
+        x, x_len = self.conv_in(x, x_len)
+        for i, conv in enumerate(self.conv_stack):
+            x, x_len = conv(x, x_len)
+        return x, x_len
--- a/deepspeech/models/deepspeech2.py
+++ b/deepspeech/models/deepspeech2.py
@@ -18,16 +18,16 @@ import paddle
 from paddle import nn
 from yacs.config import CfgNode

-from deepspeech.modules.conv import ConvStack
+from deepspeech.models.ds2.conv import ConvStack
+from deepspeech.models.ds2.rnn import RNNStack
 from deepspeech.modules.ctc import CTCDecoder
-from deepspeech.modules.rnn import RNNStack
-from deepspeech.utils import checkpoint
 from deepspeech.utils import layer_tools
+from deepspeech.utils.checkpoint import Checkpoint
 from deepspeech.utils.log import Log

 logger = Log(__name__).getlog()

-__all__ = ['DeepSpeech2Model']
+__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel']


 class CRNNEncoder(nn.Layer):
@@ -117,7 +117,7 @@ class DeepSpeech2Model(nn.Layer):
    :type share_weights: bool
    :return: A tuple of an output unnormalized log probability layer (
             before softmax) and a ctc cost layer.
-    :rtype: tuple of LayerOutput    
+    :rtype: tuple of LayerOutput
    """

    @classmethod
@@ -198,36 +198,57 @@ class DeepSpeech2Model(nn.Layer):
            cutoff_top_n, num_processes)

    @classmethod
-    def from_pretrained(cls, dataset, config, checkpoint_path):
+    def from_pretrained(cls, dataloader, config, checkpoint_path):
        """Build a DeepSpeech2Model model from a pretrained model.
        Parameters
        ----------
-        dataset: paddle.io.Dataset
+        dataloader: paddle.io.DataLoader

        config: yacs.config.CfgNode
            model configs
-        
+
        checkpoint_path: Path or str
            the path of pretrained model checkpoint, without extension name
-        
+
        Returns
        -------
        DeepSpeech2Model
            The model built from pretrained result.
        """
-        model = cls(feat_size=dataset.feature_size,
-                    dict_size=dataset.vocab_size,
+        model = cls(feat_size=dataloader.collate_fn.feature_size,
+                    dict_size=dataloader.collate_fn.vocab_size,
                    num_conv_layers=config.model.num_conv_layers,
                    num_rnn_layers=config.model.num_rnn_layers,
                    rnn_size=config.model.rnn_layer_size,
                    use_gru=config.model.use_gru,
                    share_rnn_weights=config.model.share_rnn_weights)
-        infos = checkpoint.load_parameters(
+        infos = Checkpoint().load_parameters(
            model, checkpoint_path=checkpoint_path)
        logger.info(f"checkpoint info: {infos}")
        layer_tools.summary(model)
        return model

+    @classmethod
+    def from_config(cls, config):
+        """Build a DeepSpeech2Model from config.
+
+        No checkpoint is loaded; the model is only constructed.
+
+        Parameters
+        ----------
+        config: yacs.config.CfgNode
+            config.model. The fields read are feat_size, dict_size,
+            num_conv_layers, num_rnn_layers, rnn_layer_size, use_gru and
+            share_rnn_weights.
+
+        Returns
+        -------
+        DeepSpeech2Model
+            The model built from config.
+        """
+        model = cls(feat_size=config.feat_size,
+                    dict_size=config.dict_size,
+                    num_conv_layers=config.num_conv_layers,
+                    num_rnn_layers=config.num_rnn_layers,
+                    rnn_size=config.rnn_layer_size,
+                    use_gru=config.use_gru,
+                    share_rnn_weights=config.share_rnn_weights)
+        return model
+

 class DeepSpeech2InferModel(DeepSpeech2Model):
    def __init__(self,
@@ -260,3 +281,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
        eouts, eouts_len = self.encoder(audio, audio_len)
        probs = self.decoder.softmax(eouts)
        return probs
+
+    def export(self):
+        """Convert this model with ``paddle.jit.to_static``.
+
+        The input specification declares two inputs: a float32 audio tensor
+        whose last dimension is ``self.encoder.feat_size`` and an int64
+        tensor of audio lengths. All other dimensions are left unspecified.
+
+        Returns:
+            The object returned by ``paddle.jit.to_static``.
+        """
+        static_model = paddle.jit.to_static(
+            self,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, None, self.encoder.feat_size],
+                    dtype='float32'),  # audio, [B,T,D]
+                paddle.static.InputSpec(shape=[None],
+                                        dtype='int64'),  # audio_length, [B]
+            ])
+        return static_model
--- a/deepspeech/models/ds2/rnn.py
+++ b/deepspeech/models/ds2/rnn.py
--- a/deepspeech/models/ds2_online/__init__.py
+++ b/deepspeech/models/ds2_online/__init__.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .deepspeech2 import DeepSpeech2InferModelOnline
+from .deepspeech2 import DeepSpeech2ModelOnline
+
+__all__ = ['DeepSpeech2ModelOnline', 'DeepSpeech2InferModelOnline']
--- a/deepspeech/models/ds2_online/conv.py
+++ b/deepspeech/models/ds2_online/conv.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle import nn
+
+from deepspeech.modules.embedding import PositionalEncoding
+from deepspeech.modules.subsampling import Conv2dSubsampling4
+
+
+class Conv2dSubsampling4Online(Conv2dSubsampling4):
+    """Conv2dSubsampling4 variant whose forward works on lengths, not masks.
+
+    ``forward`` only applies the convolution inherited from the parent and
+    flattens the result; it returns updated sequence lengths alongside it.
+    """
+
+    def __init__(self, idim: int, odim: int, dropout_rate: float):
+        """Construct the layer.
+
+        Args:
+            idim (int): Input dimension.
+            odim (int): Output dimension.
+            dropout_rate (float): Dropout rate, forwarded to the parent.
+        """
+        # the fourth positional argument is None; presumably this is the
+        # parent's pos_enc_class, i.e. no positional encoding -- TODO confirm
+        super().__init__(idim, odim, dropout_rate, None)
+        # size of the last axis after forward() flattens channels and features
+        self.output_dim = ((idim - 1) // 2 - 1) // 2 * odim
+        self.receptive_field_length = 2 * (
+            3 - 1) + 3  # stride_1 * (kernel_size_2 - 1) + kernel_size_1
+
+    def forward(self, x: paddle.Tensor,
+                x_len: paddle.Tensor) -> [paddle.Tensor, paddle.Tensor]:
+        """Subsample x and update the sequence lengths accordingly.
+
+        Args:
+            x (paddle.Tensor): input; a channel axis is inserted at position 1
+                to give (b, c=1, t, f).
+            x_len (paddle.Tensor): sequence lengths before subsampling.
+
+        Returns:
+            The convolved tensor with its last two axes flattened into one,
+            and the lengths computed as ((x_len - 1) // 2 - 1) // 2.
+        """
+        x = x.unsqueeze(1)  # (b, c=1, t, f)
+        x = self.conv(x)
+        #b, c, t, f = paddle.shape(x) #not work under jit
+        # move the time axis ahead of the channel axis, then merge the last
+        # two axes; the 0 entries in reshape presumably keep the matching
+        # input dimensions (see paddle.reshape) -- TODO confirm
+        x = x.transpose([0, 2, 1, 3]).reshape([0, 0, -1])
+        x_len = ((x_len - 1) // 2 - 1) // 2
+        return x, x_len
--- a/deepspeech/models/ds2_online/deepspeech2.py
+++ b/deepspeech/models/ds2_online/deepspeech2.py
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
--- a/deepspeech/modules/conv.py
+++ b/deepspeech/modules/conv.py
@@ -114,7 +114,8 @@ class ConvBn(nn.Layer):
        masks = make_non_pad_mask(x_len)  #[B, T]
        masks = masks.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, T]
        # TODO(Hui Zhang): not support bool multiply
-        masks = masks.type_as(x)
+        # masks = masks.type_as(x)
+        masks = masks.astype(x.dtype)
        x = x.multiply(masks)

        return x, x_len

--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@@ -219,11 +219,14 @@ class BaseEncoder(nn.Layer):

        xs, pos_emb, _ = self.embed(
            xs, tmp_masks, offset=offset)  #xs=(B, T, D), pos_emb=(B=1, T, D)
+
        if subsampling_cache is not None:
            cache_size = subsampling_cache.size(1)  #T
            xs = paddle.cat((subsampling_cache, xs), dim=1)
        else:
            cache_size = 0
+
+        # only used when using `RelPositionMultiHeadedAttention`
        pos_emb = self.embed.position_encoding(
            offset=offset - cache_size, size=xs.size(1))

@@ -237,7 +240,7 @@ class BaseEncoder(nn.Layer):

        # Real mask for transformer/conformer layers
        masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool)
-        masks = masks.unsqueeze(1)  #[B=1, C=1, T]
+        masks = masks.unsqueeze(1)  #[B=1, L'=1, T]
        r_elayers_output_cache = []
        r_conformer_cnn_cache = []
        for i, layer in enumerate(self.encoders):

--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@@ -309,6 +309,6 @@ class RNNStack(nn.Layer):
            masks = make_non_pad_mask(x_len)  #[B, T]
            masks = masks.unsqueeze(-1)  # [B, T, 1]
            # TODO(Hui Zhang): not support bool multiply
-            masks = masks.type_as(x)
+            masks = masks.astype(x.dtype)
            x = x.multiply(masks)
        return x, x_len
--- a/deepspeech/modules/subsampling.py
+++ b/deepspeech/modules/subsampling.py
@@ -92,7 +92,7 @@ class Conv2dSubsampling4(BaseSubsampling):
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an Conv2dSubsampling4 object.
-        
+
        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.
@@ -143,7 +143,7 @@ class Conv2dSubsampling6(BaseSubsampling):
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an Conv2dSubsampling6 object.
-        
+
        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.
@@ -196,7 +196,7 @@ class Conv2dSubsampling8(BaseSubsampling):
                 dropout_rate: float,
                 pos_enc_class: nn.Layer=PositionalEncoding):
        """Construct an Conv2dSubsampling8 object.
-        
+
        Args:
            idim (int): Input dimension.
            odim (int): Output dimension.

--- a/deepspeech/training/gradclip.py
+++ b/deepspeech/training/gradclip.py
@@ -27,6 +27,9 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    def __init__(self, clip_norm):
        super().__init__(clip_norm)

+    def __repr__(self):
+        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
+
    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        params_and_grads = []

--- a/deepspeech/training/optimizer.py
+++ b/deepspeech/training/optimizer.py
--- a/deepspeech/training/scheduler.py
+++ b/deepspeech/training/scheduler.py
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
--- a/deepspeech/utils/bleu_score.py
+++ b/deepspeech/utils/bleu_score.py
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module provides functions to calculate bleu score in different level.
+e.g. `bleu` for word-level BLEU, `char_bleu` for char-level BLEU.
+"""
+import sacrebleu
+
+__all__ = ['bleu', 'char_bleu']
+
+
+def bleu(hypothesis, reference):
+    """Calculate corpus-level BLEU with sacrebleu.
+
+    Both arguments are handed to ``sacrebleu.corpus_bleu`` unchanged, so the
+    texts are compared as given (word-level for space-separated words).
+
+    :param hypothesis: The hypothesis sentences.
+    :type hypothesis: list[str]
+    :param reference: The reference sentences.
+    :type reference: list[list[str]]
+    :return: Whatever ``sacrebleu.corpus_bleu`` returns for these inputs.
+
+    No validation is done here; any exception for malformed or empty input
+    comes from sacrebleu itself.
+    """
+
+    return sacrebleu.corpus_bleu(hypothesis, reference)
+
+
+def char_bleu(hypothesis, reference):
+    """Calculate corpus-level BLEU at character level with sacrebleu.
+
+    Every sentence has its spaces removed and is then re-joined with a single
+    space between consecutive characters, so that each character is one
+    space-separated token when passed to ``sacrebleu.corpus_bleu``.
+
+    :param hypothesis: The hypothesis sentences.
+    :type hypothesis: list[str]
+    :param reference: The reference sentences.
+    :type reference: list[list[str]]
+    :return: Whatever ``sacrebleu.corpus_bleu`` returns for the re-spaced
+        inputs.
+
+    No validation is done here; any exception for malformed or empty input
+    comes from sacrebleu itself.
+    """
+    # Drop existing spaces, then put one space between every character.
+    hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
+    # Same transformation for every sentence of every inner reference list.
+    reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
+                 for ref in reference]
+
+    return sacrebleu.corpus_bleu(hypothesis, reference)
--- a/deepspeech/utils/checkpoint.py
+++ b/deepspeech/utils/checkpoint.py
--- a/deepspeech/utils/ctc_utils.py
+++ b/deepspeech/utils/ctc_utils.py
--- a/deepspeech/utils/dynamic_import.py
+++ b/deepspeech/utils/dynamic_import.py
--- a/deepspeech/utils/log.py
+++ b/deepspeech/utils/log.py
--- a/deepspeech/utils/socket_server.py
+++ b/deepspeech/utils/socket_server.py
--- a/deepspeech/utils/tensor_utils.py
+++ b/deepspeech/utils/tensor_utils.py
--- a/deepspeech/utils/text_grid.py
+++ b/deepspeech/utils/text_grid.py
--- a/deepspeech/utils/utility.py
+++ b/deepspeech/utils/utility.py
--- a/env.sh
+++ b/env.sh
--- a/examples/aishell/README.md
+++ b/examples/aishell/README.md
--- a/examples/aishell/s0/README.md
+++ b/examples/aishell/s0/README.md
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
--- a/examples/aishell/s0/conf/deepspeech2_online.yaml
+++ b/examples/aishell/s0/conf/deepspeech2_online.yaml
--- a/examples/aishell/s0/local/client.sh
+++ b/examples/aishell/s0/local/client.sh
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
--- a/examples/aishell/s0/local/download_lm_ch.sh
+++ b/examples/aishell/s0/local/download_lm_ch.sh
--- a/examples/aishell/s0/local/export.sh
+++ b/examples/aishell/s0/local/export.sh
--- a/examples/aishell/s0/local/server.sh
+++ b/examples/aishell/s0/local/server.sh
--- a/examples/aishell/s0/local/test.sh
+++ b/examples/aishell/s0/local/test.sh
--- a/examples/aishell/s0/local/train.sh
+++ b/examples/aishell/s0/local/train.sh
--- a/examples/aishell/s0/local/tune.sh
+++ b/examples/aishell/s0/local/tune.sh
--- a/examples/aishell/s0/path.sh
+++ b/examples/aishell/s0/path.sh
--- a/examples/aishell/s0/run.sh
+++ b/examples/aishell/s0/run.sh
--- a/examples/aishell/s1/README.md
+++ b/examples/aishell/s1/README.md
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
--- a/examples/aishell/s1/local/aishell_train_lms.sh
+++ b/examples/aishell/s1/local/aishell_train_lms.sh
--- a/examples/aishell/s1/local/align.sh
+++ b/examples/aishell/s1/local/align.sh
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
--- a/examples/aishell/s1/local/download_lm_ch.sh
+++ b/examples/aishell/s1/local/download_lm_ch.sh
-../../s0/local/download_lm_ch.sh
\ No newline at end of file
--- a/examples/aishell/s1/local/export.sh
+++ b/examples/aishell/s1/local/export.sh
--- a/examples/aishell/s1/local/test.sh
+++ b/examples/aishell/s1/local/test.sh
--- a/examples/aishell/s1/local/tlg.sh
+++ b/examples/aishell/s1/local/tlg.sh
--- a/examples/aishell/s1/local/train.sh
+++ b/examples/aishell/s1/local/train.sh
--- a/examples/aishell/s1/path.sh
+++ b/examples/aishell/s1/path.sh
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
--- a/examples/aishell/s1/utils
+++ b/examples/aishell/s1/utils
+../../../utils
\ No newline at end of file
--- a/examples/callcenter/s1/.gitignore
+++ b/examples/callcenter/s1/.gitignore
--- a/examples/callcenter/s1/README.md
+++ b/examples/callcenter/s1/README.md
--- a/examples/callcenter/s1/conf/augmentation.json
+++ b/examples/callcenter/s1/conf/augmentation.json
--- a/examples/callcenter/s1/conf/chunk_conformer.yaml
+++ b/examples/callcenter/s1/conf/chunk_conformer.yaml
--- a/examples/callcenter/s1/conf/conformer.yaml
+++ b/examples/callcenter/s1/conf/conformer.yaml
--- a/examples/callcenter/s1/local/align.sh
+++ b/examples/callcenter/s1/local/align.sh
--- a/examples/callcenter/s1/local/data.sh
+++ b/examples/callcenter/s1/local/data.sh
--- a/examples/callcenter/s1/local/download_lm_ch.sh
+++ b/examples/callcenter/s1/local/download_lm_ch.sh
--- a/examples/callcenter/s1/local/export.sh
+++ b/examples/callcenter/s1/local/export.sh
--- a/examples/callcenter/s1/local/test.sh
+++ b/examples/callcenter/s1/local/test.sh
--- a/examples/callcenter/s1/local/train.sh
+++ b/examples/callcenter/s1/local/train.sh
--- a/examples/callcenter/s1/path.sh
+++ b/examples/callcenter/s1/path.sh
--- a/examples/callcenter/s1/run.sh
+++ b/examples/callcenter/s1/run.sh
--- a/examples/cc-cedict/path.sh
+++ b/examples/cc-cedict/path.sh
--- a/examples/chinese_g2p/path.sh
+++ b/examples/chinese_g2p/path.sh
--- a/examples/dataset/aidatatang_200zh/.gitignore
+++ b/examples/dataset/aidatatang_200zh/.gitignore
--- a/examples/dataset/aidatatang_200zh/README.md
+++ b/examples/dataset/aidatatang_200zh/README.md
--- a/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
+++ b/examples/dataset/aidatatang_200zh/aidatatang_200zh.py
--- a/examples/dataset/aishell/.gitignore
+++ b/examples/dataset/aishell/.gitignore
--- a/examples/dataset/aishell/README.md
+++ b/examples/dataset/aishell/README.md
--- a/examples/dataset/aishell/aishell.py
+++ b/examples/dataset/aishell/aishell.py
--- a/examples/dataset/aishell3/README.md
+++ b/examples/dataset/aishell3/README.md
--- a/examples/dataset/gigaspeech/.gitignore
+++ b/examples/dataset/gigaspeech/.gitignore
--- a/examples/dataset/gigaspeech/README.md
+++ b/examples/dataset/gigaspeech/README.md
--- a/examples/dataset/gigaspeech/gigaspeech.py
+++ b/examples/dataset/gigaspeech/gigaspeech.py
--- a/examples/dataset/gigaspeech/run.sh
+++ b/examples/dataset/gigaspeech/run.sh
--- a/examples/dataset/librispeech/.gitignore
+++ b/examples/dataset/librispeech/.gitignore
--- a/examples/dataset/librispeech/librispeech.py
+++ b/examples/dataset/librispeech/librispeech.py
--- a/examples/dataset/magicdata/README.md
+++ b/examples/dataset/magicdata/README.md
--- a/examples/dataset/mini_librispeech/.gitignore
+++ b/examples/dataset/mini_librispeech/.gitignore
--- a/examples/dataset/mini_librispeech/mini_librispeech.py
+++ b/examples/dataset/mini_librispeech/mini_librispeech.py
--- a/examples/dataset/multi_cn/README.md
+++ b/examples/dataset/multi_cn/README.md
--- a/examples/dataset/primewords/README.md
+++ b/examples/dataset/primewords/README.md
--- a/examples/dataset/st-cmds/README.md
+++ b/examples/dataset/st-cmds/README.md
--- a/examples/dataset/ted_en_zh/.gitignore
+++ b/examples/dataset/ted_en_zh/.gitignore
--- a/examples/dataset/ted_en_zh/ted_en_zh.py
+++ b/examples/dataset/ted_en_zh/ted_en_zh.py
--- a/examples/dataset/thchs30/.gitignore
+++ b/examples/dataset/thchs30/.gitignore
--- a/examples/dataset/thchs30/README.md
+++ b/examples/dataset/thchs30/README.md
--- a/examples/dataset/thchs30/thchs30.py
+++ b/examples/dataset/thchs30/thchs30.py
--- a/examples/dataset/timit/.gitignore
+++ b/examples/dataset/timit/.gitignore
--- a/examples/dataset/timit/timit.py
+++ b/examples/dataset/timit/timit.py
--- a/examples/dataset/timit/timit_kaldi_standard_split.py
+++ b/examples/dataset/timit/timit_kaldi_standard_split.py
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
--- a/examples/librispeech/s0/README.md
+++ b/examples/librispeech/s0/README.md
--- a/examples/librispeech/s0/conf/augmentation.json
+++ b/examples/librispeech/s0/conf/augmentation.json
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
--- a/examples/librispeech/s0/conf/deepspeech2_online.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
--- a/examples/librispeech/s0/local/download_lm_en.sh
+++ b/examples/librispeech/s0/local/download_lm_en.sh
--- a/examples/librispeech/s0/local/export.sh
+++ b/examples/librispeech/s0/local/export.sh
--- a/examples/librispeech/s0/local/test.sh
+++ b/examples/librispeech/s0/local/test.sh
--- a/examples/librispeech/s0/local/train.sh
+++ b/examples/librispeech/s0/local/train.sh
--- a/examples/librispeech/s0/local/tune.sh
+++ b/examples/librispeech/s0/local/tune.sh
--- a/examples/librispeech/s0/path.sh
+++ b/examples/librispeech/s0/path.sh
--- a/examples/librispeech/s0/run.sh
+++ b/examples/librispeech/s0/run.sh
--- a/examples/librispeech/s1/README.md
+++ b/examples/librispeech/s1/README.md
--- a/examples/librispeech/s1/conf/chunk_confermer.yaml
+++ b/examples/librispeech/s1/conf/chunk_confermer.yaml
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
--- a/examples/librispeech/s1/local/align.sh
+++ b/examples/librispeech/s1/local/align.sh
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
--- a/examples/librispeech/s1/local/download_lm_en.sh
+++ b/examples/librispeech/s1/local/download_lm_en.sh
--- a/examples/librispeech/s1/local/export.sh
+++ b/examples/librispeech/s1/local/export.sh
--- a/examples/librispeech/s1/local/test.sh
+++ b/examples/librispeech/s1/local/test.sh
--- a/examples/librispeech/s1/local/train.sh
+++ b/examples/librispeech/s1/local/train.sh
--- a/examples/librispeech/s1/path.sh
+++ b/examples/librispeech/s1/path.sh
--- a/examples/librispeech/s1/run.sh
+++ b/examples/librispeech/s1/run.sh
--- a/examples/librispeech/s1/utils
+++ b/examples/librispeech/s1/utils
--- a/examples/librispeech/s2/README.md
+++ b/examples/librispeech/s2/README.md
--- a/examples/librispeech/s2/conf/augmentation.json
+++ b/examples/librispeech/s2/conf/augmentation.json
--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
--- a/examples/librispeech/s2/local/align.sh
+++ b/examples/librispeech/s2/local/align.sh
--- a/examples/librispeech/s2/local/data.sh
+++ b/examples/librispeech/s2/local/data.sh
--- a/examples/librispeech/s2/local/download_lm_en.sh
+++ b/examples/librispeech/s2/local/download_lm_en.sh
--- a/examples/librispeech/s2/local/export.sh
+++ b/examples/librispeech/s2/local/export.sh
--- a/examples/librispeech/s2/local/test.sh
+++ b/examples/librispeech/s2/local/test.sh
--- a/examples/librispeech/s2/local/train.sh
+++ b/examples/librispeech/s2/local/train.sh
--- a/examples/librispeech/s2/path.sh
+++ b/examples/librispeech/s2/path.sh
--- a/examples/librispeech/s2/run.sh
+++ b/examples/librispeech/s2/run.sh
--- a/examples/librispeech/s2/utils
+++ b/examples/librispeech/s2/utils
--- a/examples/ngram_lm/README.md
+++ b/examples/ngram_lm/README.md
--- a/examples/ngram_lm/data/README.md
+++ b/examples/ngram_lm/data/README.md
--- a/examples/ngram_lm/data/custom_confusion.txt
+++ b/examples/ngram_lm/data/custom_confusion.txt
--- a/examples/ngram_lm/data/text_correct.txt
+++ b/examples/ngram_lm/data/text_correct.txt
--- a/examples/ngram_lm/local/build_zh_lm.sh
+++ b/examples/ngram_lm/local/build_zh_lm.sh
--- a/examples/ngram_lm/local/download_lm_zh.sh
+++ b/examples/ngram_lm/local/download_lm_zh.sh
--- a/examples/ngram_lm/local/kenlm_score_test.py
+++ b/examples/ngram_lm/local/kenlm_score_test.py
--- a/examples/ngram_lm/path.sh
+++ b/examples/ngram_lm/path.sh
--- a/examples/ngram_lm/requirements.txt
+++ b/examples/ngram_lm/requirements.txt
--- a/examples/ngram_lm/run.sh
+++ b/examples/ngram_lm/run.sh
--- a/examples/spm/path.sh
+++ b/examples/spm/path.sh
--- a/examples/ted_en_zh/README.md
+++ b/examples/ted_en_zh/README.md
--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
--- a/examples/ted_en_zh/t0/local/data.sh
+++ b/examples/ted_en_zh/t0/local/data.sh
--- a/examples/ted_en_zh/t0/local/test.sh
+++ b/examples/ted_en_zh/t0/local/test.sh
--- a/examples/ted_en_zh/t0/local/train.sh
+++ b/examples/ted_en_zh/t0/local/train.sh
--- a/examples/ted_en_zh/t0/path.sh
+++ b/examples/ted_en_zh/t0/path.sh
--- a/examples/ted_en_zh/t0/run.sh
+++ b/examples/ted_en_zh/t0/run.sh
--- a/examples/text_normalization/README.md
+++ b/examples/text_normalization/README.md
--- a/examples/text_normalization/data/sentences.txt
+++ b/examples/text_normalization/data/sentences.txt
--- a/examples/text_normalization/local/test_normalization.py
+++ b/examples/text_normalization/local/test_normalization.py
--- a/examples/text_normalization/path.sh
+++ b/examples/text_normalization/path.sh
--- a/examples/text_normalization/run.sh
+++ b/examples/text_normalization/run.sh
--- a/examples/thchs30/README.md
+++ b/examples/thchs30/README.md
--- a/examples/thchs30/a0/README.md
+++ b/examples/thchs30/a0/README.md
--- a/examples/thchs30/a0/data/dict/syllable.lexicon
+++ b/examples/thchs30/a0/data/dict/syllable.lexicon
--- a/examples/thchs30/a0/local/data.sh
+++ b/examples/thchs30/a0/local/data.sh
--- a/examples/thchs30/a0/local/gen_word2phone.py
+++ b/examples/thchs30/a0/local/gen_word2phone.py
--- a/examples/thchs30/a0/local/reorganize_thchs30.py
+++ b/examples/thchs30/a0/local/reorganize_thchs30.py
--- a/examples/thchs30/a0/path.sh
+++ b/examples/thchs30/a0/path.sh
--- a/examples/thchs30/a0/run.sh
+++ b/examples/thchs30/a0/run.sh
--- a/examples/timit/README.md
+++ b/examples/timit/README.md
--- a/examples/timit/s1/README.md
+++ b/examples/timit/s1/README.md
--- a/examples/timit/s1/conf/augmentation.json
+++ b/examples/timit/s1/conf/augmentation.json
--- a/examples/timit/s1/conf/dev_spk.list
+++ b/examples/timit/s1/conf/dev_spk.list
--- a/examples/timit/s1/conf/phones.60-48-39.map
+++ b/examples/timit/s1/conf/phones.60-48-39.map
--- a/examples/timit/s1/conf/test_spk.list
+++ b/examples/timit/s1/conf/test_spk.list
--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
--- a/examples/timit/s1/local/align.sh
+++ b/examples/timit/s1/local/align.sh
--- a/examples/timit/s1/local/data.sh
+++ b/examples/timit/s1/local/data.sh
--- a/examples/timit/s1/local/export.sh
+++ b/examples/timit/s1/local/export.sh
--- a/examples/timit/s1/local/test.sh
+++ b/examples/timit/s1/local/test.sh
--- a/examples/timit/s1/local/timit_data_prep.sh
+++ b/examples/timit/s1/local/timit_data_prep.sh
--- a/examples/timit/s1/local/timit_norm_trans.pl
+++ b/examples/timit/s1/local/timit_norm_trans.pl
--- a/examples/timit/s1/local/train.sh
+++ b/examples/timit/s1/local/train.sh
--- a/examples/timit/s1/path.sh
+++ b/examples/timit/s1/path.sh
--- a/examples/timit/s1/run.sh
+++ b/examples/timit/s1/run.sh
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
--- a/examples/tiny/s0/conf/deepspeech2_online.yaml
+++ b/examples/tiny/s0/conf/deepspeech2_online.yaml
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
--- a/examples/tiny/s0/local/download_lm_en.sh
+++ b/examples/tiny/s0/local/download_lm_en.sh
--- a/examples/tiny/s0/local/export.sh
+++ b/examples/tiny/s0/local/export.sh
--- a/examples/tiny/s0/local/test.sh
+++ b/examples/tiny/s0/local/test.sh
--- a/examples/tiny/s0/local/train.sh
+++ b/examples/tiny/s0/local/train.sh
--- a/examples/tiny/s0/local/tune.sh
+++ b/examples/tiny/s0/local/tune.sh
--- a/examples/tiny/s0/path.sh
+++ b/examples/tiny/s0/path.sh
--- a/examples/tiny/s0/run.sh
+++ b/examples/tiny/s0/run.sh
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
--- a/examples/tiny/s1/local/align.sh
+++ b/examples/tiny/s1/local/align.sh
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
--- a/examples/tiny/s1/local/download_lm_en.sh
+++ b/examples/tiny/s1/local/download_lm_en.sh
--- a/examples/tiny/s1/local/export.sh
+++ b/examples/tiny/s1/local/export.sh
--- a/examples/tiny/s1/local/test.sh
+++ b/examples/tiny/s1/local/test.sh
--- a/examples/tiny/s1/local/train.sh
+++ b/examples/tiny/s1/local/train.sh
--- a/examples/tiny/s1/path.sh
+++ b/examples/tiny/s1/path.sh
--- a/examples/tiny/s1/run.sh
+++ b/examples/tiny/s1/run.sh
--- a/requirements.txt
+++ b/requirements.txt
--- a/setup.sh
+++ b/setup.sh
--- a/speechnn/CMakeLists.txt
+++ b/speechnn/CMakeLists.txt
--- a/speechnn/core/CMakeLists.txt
+++ b/speechnn/core/CMakeLists.txt
--- a/speechnn/core/decoder/CMakeLists.txt
+++ b/speechnn/core/decoder/CMakeLists.txt
--- a/speechnn/core/frontend/CMakeLists.txt
+++ b/speechnn/core/frontend/CMakeLists.txt
--- a/speechnn/core/frontend/audio/CMakeLists.txt
+++ b/speechnn/core/frontend/audio/CMakeLists.txt
--- a/speechnn/core/frontend/text/CMakeLists.txt
+++ b/speechnn/core/frontend/text/CMakeLists.txt
--- a/speechnn/core/model/CMakeLists.txt
+++ b/speechnn/core/model/CMakeLists.txt
--- a/speechnn/core/protocol/CMakeLists.txt
+++ b/speechnn/core/protocol/CMakeLists.txt
--- a/speechnn/core/utils/CMakeLists.txt
+++ b/speechnn/core/utils/CMakeLists.txt
--- a/speechnn/third_party/CMakeLists.txt
+++ b/speechnn/third_party/CMakeLists.txt
--- a/tests/deepspeech2_model_test.py
+++ b/tests/deepspeech2_model_test.py
--- a/tests/deepspeech2_online_model_test.py
+++ b/tests/deepspeech2_online_model_test.py
--- a/third_party/nnAudio/.gitignore
+++ b/third_party/nnAudio/.gitignore
--- a/third_party/nnAudio/nnAudio/Spectrogram.py
+++ b/third_party/nnAudio/nnAudio/Spectrogram.py
--- a/third_party/nnAudio/nnAudio/__init__.py
+++ b/third_party/nnAudio/nnAudio/__init__.py
--- a/third_party/nnAudio/nnAudio/librosa_functions.py
+++ b/third_party/nnAudio/nnAudio/librosa_functions.py
--- a/third_party/nnAudio/nnAudio/utils.py
+++ b/third_party/nnAudio/nnAudio/utils.py
--- a/third_party/nnAudio/setup.py
+++ b/third_party/nnAudio/setup.py
--- a/third_party/nnAudio/tests/parameters.py
+++ b/third_party/nnAudio/tests/parameters.py
--- a/third_party/nnAudio/tests/test_spectrogram.py
+++ b/third_party/nnAudio/tests/test_spectrogram.py
--- a/third_party/paddle_audio/frontend.py
+++ b/third_party/paddle_audio/frontend.py
--- a/third_party/text_processing/__ini__.py
+++ b/third_party/text_processing/__ini__.py
--- a/third_party/text_processing/__init__.py
+++ b/third_party/text_processing/__init__.py
--- a/third_party/text_processing/normalization/__init__.py
+++ b/third_party/text_processing/normalization/__init__.py
--- a/third_party/text_processing/normalization/char_convert.py
+++ b/third_party/text_processing/normalization/char_convert.py
--- a/third_party/text_processing/normalization/chronology.py
+++ b/third_party/text_processing/normalization/chronology.py
--- a/third_party/text_processing/normalization/constants.py
+++ b/third_party/text_processing/normalization/constants.py
--- a/third_party/text_processing/normalization/num.py
+++ b/third_party/text_processing/normalization/num.py
--- a/third_party/text_processing/normalization/phone.py
+++ b/third_party/text_processing/normalization/phone.py
--- a/third_party/text_processing/normalization/quantifier.py
+++ b/third_party/text_processing/normalization/quantifier.py
--- a/third_party/text_processing/normalization/sentence_split.py
+++ b/third_party/text_processing/normalization/sentence_split.py
--- a/tools/Makefile
+++ b/tools/Makefile
--- a/tools/extras/README.md
+++ b/tools/extras/README.md
--- a/tools/extras/install_gcc.sh
+++ b/tools/extras/install_gcc.sh
--- a/tools/extras/install_kaldi.sh
+++ b/tools/extras/install_kaldi.sh
--- a/tools/extras/install_kenlm.sh
+++ b/tools/extras/install_kenlm.sh
--- a/tools/extras/install_liblbfgs.sh
+++ b/tools/extras/install_liblbfgs.sh
--- a/tools/extras/install_mfa.sh
+++ b/tools/extras/install_mfa.sh
--- a/tools/extras/install_miniconda.sh
+++ b/tools/extras/install_miniconda.sh
--- a/tools/extras/install_mkl.sh
+++ b/tools/extras/install_mkl.sh
--- a/tools/extras/install_ngram.sh
+++ b/tools/extras/install_ngram.sh
--- a/tools/extras/install_openblas.sh
+++ b/tools/extras/install_openblas.sh
--- a/tools/extras/install_openfst.sh
+++ b/tools/extras/install_openfst.sh
--- a/tools/extras/install_pynini.sh
+++ b/tools/extras/install_pynini.sh
--- a/tools/extras/install_srilm.sh
+++ b/tools/extras/install_srilm.sh
--- a/tools/extras/srilm.patch
+++ b/tools/extras/srilm.patch
--- a/utils/__init__.py
+++ b/utils/__init__.py
--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
--- a/utils/dump_manifest.py
+++ b/utils/dump_manifest.py
--- a/utils/duration_from_maniefst.sh
+++ b/utils/duration_from_maniefst.sh
--- a/utils/filter_scp.pl
+++ b/utils/filter_scp.pl
--- a/utils/format_triplet_data.py
+++ b/utils/format_triplet_data.py
--- a/utils/fst/add_lex_disambig.pl
+++ b/utils/fst/add_lex_disambig.pl
--- a/utils/fst/compile_lexicon_token_fst.sh
+++ b/utils/fst/compile_lexicon_token_fst.sh
--- a/utils/fst/ctc_token_fst.py
+++ b/utils/fst/ctc_token_fst.py
--- a/utils/fst/ctc_token_fst_corrected.py
+++ b/utils/fst/ctc_token_fst_corrected.py
--- a/utils/fst/eps2disambig.pl
+++ b/utils/fst/eps2disambig.pl
--- a/utils/fst/make_lexicon_fst.pl
+++ b/utils/fst/make_lexicon_fst.pl
--- a/utils/fst/make_tlg.sh
+++ b/utils/fst/make_tlg.sh
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
--- a/utils/fst/remove_oovs.pl
+++ b/utils/fst/remove_oovs.pl
--- a/utils/fst/rnnt_token_fst.py
+++ b/utils/fst/rnnt_token_fst.py
--- a/utils/fst/s2eps.pl
+++ b/utils/fst/s2eps.pl
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
--- a/utils/ngram_train.sh
+++ b/utils/ngram_train.sh
--- a/utils/tarball.sh
+++ b/utils/tarball.sh
--- a/utils/utility.py
+++ b/utils/utility.py