Commit 61fe292c authored by: H huangyuxin

Merge branch 'develop' of https://github.com/PaddlePaddle/DeepSpeech into ds2_online

unset GREP_OPTIONS
# https://zhuanlan.zhihu.com/p/33050965
alias nvs='nvidia-smi'
alias his='history'
alias jobs='jobs -l'
alias ports='netstat -tulanp'
alias wget='wget -c'
## Colorize the grep command output for ease of use (good for log files)##
alias grep='grep --color=auto'
alias egrep='egrep --color=auto'
alias fgrep='fgrep --color=auto'
...@@ -10,11 +10,13 @@
 .ipynb_checkpoints
 *.npz
 *.done
+*.whl
 tools/venv
 tools/kenlm
 tools/sox-14.4.2
 tools/soxbindings
+tools/montreal-forced-aligner/
 tools/Montreal-Forced-Aligner/
 *output/
...@@ -16,7 +16,7 @@
 ## Setup
 * python>=3.7
-* paddlepaddle>=2.1.0
+* paddlepaddle>=2.1.2

 Please see [install](doc/src/install.md).
......
...@@ -17,7 +17,7 @@
 ## Installation
 * python>=3.7
-* paddlepaddle>=2.1.0
+* paddlepaddle>=2.1.2

 See [install](doc/src/install.md)
......
...@@ -84,9 +84,8 @@ FILES = glob.glob('kenlm/util/*.cc') \
 FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
 FILES = [
-    fn for fn in FILES
-    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
-        'unittest.cc'))
+    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
+                               or fn.endswith('unittest.cc'))
 ]
 LIBS = ['stdc++']
......
...@@ -31,8 +31,8 @@ from deepspeech.io.dataset import ManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
 from deepspeech.io.sampler import SortagradDistributedBatchSampler
 from deepspeech.models.u2 import U2Model
-from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
-from deepspeech.training.scheduler import WarmupLR
+from deepspeech.training.optimizer import OptimizerFactory
+from deepspeech.training.scheduler import LRSchedulerFactory
 from deepspeech.training.trainer import Trainer
 from deepspeech.utils import ctc_utils
 from deepspeech.utils import error_rate
...@@ -312,30 +312,38 @@ class U2Trainer(Trainer):
         scheduler_type = train_config.scheduler
         scheduler_conf = train_config.scheduler_conf

-        grad_clip = ClipGradByGlobalNormWithLog(train_config.global_grad_clip)
-        weight_decay = paddle.regularizer.L2Decay(optim_conf.weight_decay)
-
-        if scheduler_type == 'expdecaylr':
-            lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
-                learning_rate=optim_conf.lr,
-                gamma=scheduler_conf.lr_decay,
-                verbose=False)
-        elif scheduler_type == 'warmuplr':
-            lr_scheduler = WarmupLR(
-                learning_rate=optim_conf.lr,
-                warmup_steps=scheduler_conf.warmup_steps,
-                verbose=False)
-        else:
-            raise ValueError(f"Not support scheduler: {scheduler_type}")
-
-        if optim_type == 'adam':
-            optimizer = paddle.optimizer.Adam(
-                learning_rate=lr_scheduler,
-                parameters=model.parameters(),
-                weight_decay=weight_decay,
-                grad_clip=grad_clip)
-        else:
-            raise ValueError(f"Not support optim: {optim_type}")
+        scheduler_args = {
+            "learning_rate": optim_conf.lr,
+            "verbose": False,
+            "warmup_steps": scheduler_conf.warmup_steps,
+            "gamma": scheduler_conf.lr_decay,
+            "d_model": model_conf.encoder_conf.output_size,
+        }
+        lr_scheduler = LRSchedulerFactory.from_args(scheduler_type,
+                                                    scheduler_args)
+
+        def optimizer_args(
+                config,
+                parameters,
+                lr_scheduler=None, ):
+            train_config = config.training
+            optim_type = train_config.optim
+            optim_conf = train_config.optim_conf
+            scheduler_type = train_config.scheduler
+            scheduler_conf = train_config.scheduler_conf
+            return {
+                "grad_clip": train_config.global_grad_clip,
+                "weight_decay": optim_conf.weight_decay,
+                "learning_rate": lr_scheduler
+                if lr_scheduler else optim_conf.lr,
+                "parameters": parameters,
+                "epsilon": 1e-9 if optim_type == 'noam' else None,
+                "beta1": 0.9 if optim_type == 'noam' else None,
+                "beta2": 0.98 if optim_type == 'noam' else None,
+            }
+
+        optim_args = optimizer_args(config, model.parameters(), lr_scheduler)
+        optimizer = OptimizerFactory.from_args(optim_type, optim_args)

         self.model = model
         self.optimizer = optimizer
......
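The hunk above swaps the hand-rolled scheduler/optimizer construction for the two factories introduced later in this commit (deepspeech/training/optimizer.py and deepspeech/training/scheduler.py). A minimal sketch of the intended call pattern, with illustrative values standing in for the yacs config fields:

# Sketch only: `model` stands for an existing paddle.nn.Layer.
scheduler_args = {
    "learning_rate": 0.004,   # optim_conf.lr
    "warmup_steps": 25000,    # scheduler_conf.warmup_steps
    "gamma": 1.0,             # scheduler_conf.lr_decay (used by 'expdecaylr')
    "d_model": 256,           # encoder output size (used by 'noam')
    "verbose": False,
}
lr_scheduler = LRSchedulerFactory.from_args("warmuplr", scheduler_args)

optimizer = OptimizerFactory.from_args("adam", {
    "learning_rate": lr_scheduler,
    "parameters": model.parameters(),
    "weight_decay": 1e-6,     # wrapped into paddle.regularizer.L2Decay by the factory
    "grad_clip": 5.0,         # wrapped into ClipGradByGlobalNormWithLog by the factory
})

Both factories filter the argument dict down to the keyword arguments the resolved class actually accepts (see `filter_valid_args` in deepspeech/utils/dynamic_import.py), so keys that only one scheduler type needs, such as `d_model` or `gamma`, can be passed unconditionally.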
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Export for U2 model."""
from deepspeech.exps.u2_st.config import get_cfg_defaults
from deepspeech.exps.u2_st.model import U2STTester as Tester
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Tester(config, args)
exp.setup()
exp.run_export()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
main(config, args)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
from deepspeech.exps.u2_st.config import get_cfg_defaults
from deepspeech.exps.u2_st.model import U2STTester as Tester
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
# TODO(hui zhang): dynamic load
def main_sp(config, args):
exp = Tester(config, args)
exp.setup()
exp.run_test()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats('test.profile')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Trainer for U2 model."""
import cProfile
import os
from paddle import distributed as dist
from deepspeech.exps.u2_st.config import get_cfg_defaults
from deepspeech.exps.u2_st.model import U2STTrainer as Trainer
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.utility import print_arguments
def main_sp(config, args):
exp = Trainer(config, args)
exp.setup()
exp.run()
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
args = parser.parse_args()
print_arguments(args, globals())
# https://yaml.org/type/float.html
config = get_cfg_defaults()
if args.config:
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats(os.path.join(args.output, 'train.profile'))
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from yacs.config import CfgNode
from deepspeech.exps.u2_st.model import U2STTester
from deepspeech.exps.u2_st.model import U2STTrainer
from deepspeech.io.collator_st import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.u2_st import U2STModel
_C = CfgNode()
_C.data = ManifestDataset.params()
_C.collator = SpeechCollator.params()
_C.model = U2STModel.params()
_C.training = U2STTrainer.params()
_C.decoding = U2STTester.params()
def get_cfg_defaults():
"""Get a yacs CfgNode object with default values for my_project."""
# Return a clone so that the defaults will not be altered
# This is for the "local variable" use pattern
config = _C.clone()
config.set_new_allowed(True)
return config
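The bin scripts above consume this config in the usual yacs fashion; a small sketch (the YAML path is illustrative):

config = get_cfg_defaults()
config.merge_from_file("conf/transformer.yaml")   # experiment overrides, path illustrative
config.freeze()

# Each sub-node mirrors the <Class>.params() defaults registered above.
print(config.data.train_manifest)
print(config.collator.batch_size)
print(config.training.n_epoch)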
This diff has been collapsed.
This diff has been collapsed.
...@@ -19,9 +19,7 @@ from yacs.config import CfgNode
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.utils.log import Log

-__all__ = [
-    "ManifestDataset",
-]
+__all__ = ["ManifestDataset", "TripletManifestDataset"]

 logger = Log(__name__).getlog()
...@@ -105,3 +103,16 @@ class ManifestDataset(Dataset):
     def __getitem__(self, idx):
         instance = self._manifest[idx]
         return instance["utt"], instance["feat"], instance["text"]
+
+
+class TripletManifestDataset(ManifestDataset):
+    """
+    For Joint Training of Speech Translation and ASR.
+    text: translation,
+    text1: transcript.
+    """
+
+    def __getitem__(self, idx):
+        instance = self._manifest[idx]
+        return instance["utt"], instance["feat"], instance["text"], instance[
+            "text1"]
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Deepspeech2 ASR Model"""
from typing import Optional
import paddle
from paddle import nn
from yacs.config import CfgNode
from deepspeech.modules.conv import ConvStack
from deepspeech.modules.ctc import CTCDecoder
from deepspeech.modules.rnn import RNNStack
from deepspeech.utils import layer_tools
from deepspeech.utils.checkpoint import Checkpoint
from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = ['DeepSpeech2Model']
class CRNNEncoder(nn.Layer):
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
super().__init__()
self.rnn_size = rnn_size
self.feat_size = feat_size # 161 for linear
self.dict_size = dict_size
self.conv = ConvStack(feat_size, num_conv_layers)
i_size = self.conv.output_height # H after conv stack
self.rnn = RNNStack(
i_size=i_size,
h_size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
@property
def output_size(self):
return self.rnn_size * 2
def forward(self, audio, audio_len):
"""Compute Encoder outputs
Args:
audio (Tensor): [B, Tmax, D]
text (Tensor): [B, Umax]
audio_len (Tensor): [B]
text_len (Tensor): [B]
Returns:
x (Tensor): encoder outputs, [B, T, D]
x_lens (Tensor): encoder length, [B]
"""
# [B, T, D] -> [B, D, T]
audio = audio.transpose([0, 2, 1])
# [B, D, T] -> [B, C=1, D, T]
x = audio.unsqueeze(1)
x_lens = audio_len
# convolution group
x, x_lens = self.conv(x, x_lens)
# convert data from convolution feature map to sequence of vectors
#B, C, D, T = paddle.shape(x) # not work under jit
x = x.transpose([0, 3, 1, 2]) #[B, T, C, D]
#x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit
x = x.reshape([0, 0, -1]) #[B, T, C*D]
# remove padding part
x, x_lens = self.rnn(x, x_lens) #[B, T, D]
return x, x_lens
class DeepSpeech2Model(nn.Layer):
"""The DeepSpeech2 network structure.
:param audio_data: Audio spectrogram data layer.
:type audio_data: Variable
:param text_data: Transcription text data layer.
:type text_data: Variable
:param audio_len: Valid sequence length data layer.
:type audio_len: Variable
:param masks: Masks data layer to reset padding.
:type masks: Variable
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
:type num_conv_layers: int
:param num_rnn_layers: Number of stacking RNN layers.
:type num_rnn_layers: int
:param rnn_size: RNN layer size (dimension of RNN cells).
:type rnn_size: int
:param use_gru: Use gru if set True. Use simple rnn if set False.
:type use_gru: bool
:param share_rnn_weights: Whether to share input-hidden weights between
forward and backward direction RNNs.
It is only available when use_gru=False.
:type share_rnn_weights: bool
:return: A tuple of an output unnormalized log probability layer (
before softmax) and a ctc cost layer.
:rtype: tuple of LayerOutput
"""
@classmethod
def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
default = CfgNode(
dict(
num_conv_layers=2, #Number of stacking convolution layers.
num_rnn_layers=3, #Number of stacking RNN layers.
rnn_layer_size=1024, #RNN layer size (number of RNN cells).
use_gru=True, #Use gru if set True. Use simple rnn if set False.
share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
))
if config is not None:
config.merge_from_other_cfg(default)
return default
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
super().__init__()
self.encoder = CRNNEncoder(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
assert (self.encoder.output_size == rnn_size * 2)
self.decoder = CTCDecoder(
odim=dict_size, # <blank> is in vocab
enc_n_units=self.encoder.output_size,
blank_id=0, # first token is <blank>
dropout_rate=0.0,
reduction=True, # sum
batch_average=True) # sum / batch_size
def forward(self, audio, audio_len, text, text_len):
"""Compute Model loss
Args:
audio (Tensor): [B, T, D]
audio_len (Tensor): [B]
text (Tensor): [B, U]
text_len (Tensor): [B]
Returns:
loss (Tensor): [1]
"""
eouts, eouts_len = self.encoder(audio, audio_len)
loss = self.decoder(eouts, eouts_len, text, text_len)
return loss
@paddle.no_grad()
def decode(self, audio, audio_len, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
cutoff_top_n, num_processes):
# init once
# decoders only accept string encoded in utf-8
self.decoder.init_decode(
beam_alpha=beam_alpha,
beam_beta=beam_beta,
lang_model_path=lang_model_path,
vocab_list=vocab_list,
decoding_method=decoding_method)
eouts, eouts_len = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts)
return self.decoder.decode_probs(
probs.numpy(), eouts_len, vocab_list, decoding_method,
lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob,
cutoff_top_n, num_processes)
@classmethod
def from_pretrained(cls, dataloader, config, checkpoint_path):
"""Build a DeepSpeech2Model model from a pretrained model.
Parameters
----------
dataloader: paddle.io.DataLoader
config: yacs.config.CfgNode
model configs
checkpoint_path: Path or str
the path of pretrained model checkpoint, without extension name
Returns
-------
DeepSpeech2Model
The model built from pretrained result.
"""
model = cls(feat_size=dataloader.collate_fn.feature_size,
dict_size=dataloader.collate_fn.vocab_size,
num_conv_layers=config.model.num_conv_layers,
num_rnn_layers=config.model.num_rnn_layers,
rnn_size=config.model.rnn_layer_size,
use_gru=config.model.use_gru,
share_rnn_weights=config.model.share_rnn_weights)
infos = Checkpoint().load_parameters(
model, checkpoint_path=checkpoint_path)
logger.info(f"checkpoint info: {infos}")
layer_tools.summary(model)
return model
class DeepSpeech2InferModel(DeepSpeech2Model):
def __init__(self,
feat_size,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
rnn_size=1024,
use_gru=False,
share_rnn_weights=True):
super().__init__(
feat_size=feat_size,
dict_size=dict_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
rnn_size=rnn_size,
use_gru=use_gru,
share_rnn_weights=share_rnn_weights)
def forward(self, audio, audio_len):
"""export model function
Args:
audio (Tensor): [B, T, D]
audio_len (Tensor): [B]
Returns:
probs: probs after softmax
"""
eouts, eouts_len = self.encoder(audio, audio_len)
probs = self.decoder.softmax(eouts)
return probs
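A hedged usage sketch for the classes above; the loader, config and checkpoint path are placeholders for objects created elsewhere in the training pipeline:

# `dev_loader` is assumed to be a paddle.io.DataLoader whose collate_fn is a
# SpeechCollator (it provides feature_size / vocab_size), and `config` the yacs
# config used for training; the checkpoint path is made up.
model = DeepSpeech2Model.from_pretrained(dev_loader, config,
                                         "exp/deepspeech2/checkpoints/avg_1")
model.eval()

# Run the encoder and CTC softmax, mirroring DeepSpeech2InferModel.forward.
eouts, eouts_len = model.encoder(audio, audio_len)   # audio: [B, T, D] paddle.Tensor
probs = model.decoder.softmax(eouts)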
This diff has been collapsed.
...@@ -27,6 +27,9 @@ class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
     def __init__(self, clip_norm):
         super().__init__(clip_norm)

+    def __repr__(self):
+        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
+
     @imperative_base.no_grad
     def _dygraph_clip(self, params_grads):
         params_and_grads = []
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
from typing import Dict
from typing import Text
from paddle.optimizer import Optimizer
from paddle.regularizer import L2Decay
from deepspeech.training.gradclip import ClipGradByGlobalNormWithLog
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.dynamic_import import instance_class
from deepspeech.utils.log import Log
__all__ = ["OptimizerFactory"]
logger = Log(__name__).getlog()
OPTIMIZER_DICT = {
"sgd": "paddle.optimizer:SGD",
"momentum": "paddle.optimizer:Momentum",
"adadelta": "paddle.optimizer:Adadelta",
"adam": "paddle.optimizer:Adam",
"adamw": "paddle.optimizer:AdamW",
}
def register_optimizer(cls):
"""Register optimizer."""
alias = cls.__name__.lower()
OPTIMIZER_DICT[cls.__name__.lower()] = cls.__module__ + ":" + cls.__name__
return cls
def dynamic_import_optimizer(module):
"""Import Optimizer class dynamically.
Args:
module (str): module_name:class_name or alias in `OPTIMIZER_DICT`
Returns:
type: Optimizer class
"""
module_class = dynamic_import(module, OPTIMIZER_DICT)
assert issubclass(module_class,
Optimizer), f"{module} does not implement Optimizer"
return module_class
class OptimizerFactory():
@classmethod
def from_args(cls, name: str, args: Dict[Text, Any]):
assert "parameters" in args, "parameters not in args."
assert "learning_rate" in args, "learning_rate not in args."
grad_clip = ClipGradByGlobalNormWithLog(
args['grad_clip']) if "grad_clip" in args else None
weight_decay = L2Decay(
args['weight_decay']) if "weight_decay" in args else None
module_class = dynamic_import_optimizer(name.lower())
if weight_decay:
logger.info(f'WeightDecay: {weight_decay}')
if grad_clip:
logger.info(f'GradClip: {grad_clip}')
logger.info(
f"Optimizer: {module_class.__name__} {args['learning_rate']}")
args.update({"grad_clip": grad_clip, "weight_decay": weight_decay})
return instance_class(module_class, args)
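Projects can also register their own classes so that a config string resolves to them; a hedged sketch with a purely hypothetical subclass:

import paddle


@register_optimizer
class AdamWNoam(paddle.optimizer.AdamW):
    """Hypothetical optimizer; the decorator registers it under the alias 'adamwnoam'."""
    pass


assert dynamic_import_optimizer("adamwnoam") is AdamWNoam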
...@@ -11,18 +11,53 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from typing import Any
+from typing import Dict
+from typing import Text
 from typing import Union

 from paddle.optimizer.lr import LRScheduler
 from typeguard import check_argument_types

+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.dynamic_import import instance_class
 from deepspeech.utils.log import Log

-__all__ = ["WarmupLR"]
+__all__ = ["WarmupLR", "LRSchedulerFactory"]

 logger = Log(__name__).getlog()

+SCHEDULER_DICT = {
+    "noam": "paddle.optimizer.lr:NoamDecay",
+    "expdecaylr": "paddle.optimizer.lr:ExponentialDecay",
+    "piecewisedecay": "paddle.optimizer.lr:PiecewiseDecay",
+}
+
+
+def register_scheduler(cls):
+    """Register scheduler."""
+    alias = cls.__name__.lower()
+    SCHEDULER_DICT[cls.__name__.lower()] = cls.__module__ + ":" + cls.__name__
+    return cls
+
+
+def dynamic_import_scheduler(module):
+    """Import Scheduler class dynamically.
+    Args:
+        module (str): module_name:class_name or alias in `SCHEDULER_DICT`
+    Returns:
+        type: Scheduler class
+    """
+    module_class = dynamic_import(module, SCHEDULER_DICT)
+    assert issubclass(module_class,
+                      LRScheduler), f"{module} does not implement LRScheduler"
+    return module_class
+
+
+@register_scheduler
 class WarmupLR(LRScheduler):
     """The WarmupLR scheduler
     This scheduler is almost same as NoamLR Scheduler except for following
...@@ -40,7 +75,8 @@ class WarmupLR(LRScheduler):
                  warmup_steps: Union[int, float]=25000,
                  learning_rate=1.0,
                  last_epoch=-1,
-                 verbose=False):
+                 verbose=False,
+                 **kwargs):
         assert check_argument_types()
         self.warmup_steps = warmup_steps
         super().__init__(learning_rate, last_epoch, verbose)
...@@ -64,3 +100,10 @@ class WarmupLR(LRScheduler):
             None
         '''
         self.step(epoch=step)
+
+
+class LRSchedulerFactory():
+    @classmethod
+    def from_args(cls, name: str, args: Dict[Text, Any]):
+        module_class = dynamic_import_scheduler(name.lower())
+        return instance_class(module_class, args)
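Because every alias resolves to a plain LRScheduler class and `instance_class` filters the argument dict by constructor signature, one dict can drive any of the registered schedules; a hedged sketch with illustrative numbers:

args = {"learning_rate": 1.0, "warmup_steps": 25000, "d_model": 256,
        "gamma": 0.9, "verbose": False}

noam = LRSchedulerFactory.from_args("noam", args)            # keeps d_model + warmup_steps
warmup = LRSchedulerFactory.from_args("warmuplr", args)      # keeps warmup_steps only
expdecay = LRSchedulerFactory.from_args("expdecaylr", args)  # keeps gamma only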
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module provides functions to calculate bleu score in different level.
e.g. wer for word-level, cer for char-level.
"""
import sacrebleu
__all__ = ['bleu', 'char_bleu']
def bleu(hypothesis, reference):
"""Calculate BLEU. BLEU compares reference text and
hypothesis text at word level using sacrebleu.
:param reference: The reference sentences.
:type reference: list[list[str]]
:param hypothesis: The hypothesis sentence.
:type hypothesis: list[str]
:raises ValueError: If the reference length is zero.
"""
return sacrebleu.corpus_bleu(hypothesis, reference)
def char_bleu(hypothesis, reference):
"""Calculate BLEU. BLEU compares reference text and
hypothesis text at char level using sacrebleu.
:param reference: The reference sentences.
:type reference: list[list[str]]
:param hypothesis: The hypothesis sentence.
:type hypothesis: list[str]
:raises ValueError: If the reference number is zero.
"""
hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
for ref in reference]
return sacrebleu.corpus_bleu(hypothesis, reference)
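A small usage sketch (sentences are made up); both helpers return a sacrebleu BLEU object whose `.score` field is the corpus BLEU:

hyp = ["the cat sat on the mat"]
refs = [["the cat is on the mat"]]          # one reference stream, aligned with hyp

print(bleu(hyp, refs).score)                                  # word-level BLEU
print(char_bleu(["今天天气很好"], [["今天天气真好"]]).score)  # char-level BLEU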
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import importlib
import inspect
from typing import Any
from typing import Dict
from typing import List
from typing import Text
from deepspeech.utils.log import Log
from deepspeech.utils.tensor_utils import has_tensor
logger = Log(__name__).getlog()
__all__ = ["dynamic_import", "instance_class"]
def dynamic_import(import_path, alias=dict()):
"""dynamic import module and class
:param str import_path: syntax 'module_name:class_name'
e.g., 'deepspeech.models.u2:U2Model'
:param dict alias: shortcut for registered class
:return: imported class
"""
if import_path not in alias and ":" not in import_path:
raise ValueError("import_path should be one of {} or "
'include ":", e.g. "deepspeech.models.u2:U2Model" : '
"{}".format(set(alias), import_path))
if ":" not in import_path:
import_path = alias[import_path]
module_name, objname = import_path.split(":")
m = importlib.import_module(module_name)
return getattr(m, objname)
def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]):
# filter by `valid_keys` and filter `val` is not None
new_args = {
key: val
for key, val in args.items() if (key in valid_keys and val is not None)
}
return new_args
def filter_out_tensor(args: Dict[Text, Any]):
return {key: val for key, val in args.items() if not has_tensor(val)}
def instance_class(module_class, args: Dict[Text, Any]):
valid_keys = inspect.signature(module_class).parameters.keys()
new_args = filter_valid_args(args, valid_keys)
logger.info(
f"Instance: {module_class.__name__} {filter_out_tenosr(new_args)}.")
return module_class(**new_args)
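For instance, resolving and instantiating the WarmupLR scheduler introduced elsewhere in this commit (the extra key is only there to show the filtering):

from deepspeech.training.scheduler import WarmupLR

cls = dynamic_import("deepspeech.training.scheduler:WarmupLR")
assert cls is WarmupLR

# Keys the constructor does not accept (and None values) are dropped before the call.
sched = instance_class(cls, {"learning_rate": 1.0,
                             "warmup_steps": 25000,
                             "unused_key": "ignored"})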
...@@ -19,11 +19,25 @@ import paddle
 from deepspeech.utils.log import Log

-__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy"]
+__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]

 logger = Log(__name__).getlog()

+
+def has_tensor(val):
+    if isinstance(val, (list, tuple)):
+        for item in val:
+            if has_tensor(item):
+                return True
+    elif isinstance(val, dict):
+        for k, v in val.items():
+            print(k)
+            if has_tensor(v):
+                return True
+    else:
+        return paddle.is_tensor(val)
+
+
 def pad_sequence(sequences: List[paddle.Tensor],
                  batch_first: bool=False,
                  padding_value: float=0.0) -> paddle.Tensor:
......
...@@ -32,7 +32,7 @@ collator:
   keep_transcription_text: False
   sortagrad: True
   shuffle_method: batch_shuffle
-  num_workers: 0
+  num_workers: 2

 model:
   num_conv_layers: 2
......
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
......
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
......
...@@ -17,4 +17,4 @@
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | 2.23287845 | 0.087982 |
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | 2.23287845 | 0.086962 |
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | 2.23287845 | 0.086741 |
 | conformer | 45.73 M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | 2.23287845 | 0.083495 |
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
......
-export MAIN_ROOT=${PWD}/../../
+export MAIN_ROOT=`realpath ${PWD}/../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
......
-export MAIN_ROOT=${PWD}/../../
+export MAIN_ROOT=`realpath ${PWD}/../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
......
*.tar.gz.*
manifest.*
*.md
EN-ZH/
train-split/
test-segment/
\ No newline at end of file
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Ted-En-Zh speech translation dataset
Create manifest files from the split dataset.
dev set: tst2010, test set: tst2015
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
import soundfile
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--src_dir",
default="",
type=str,
help="Directory to kaldi splited data. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
data_types_infos = [
('train', 'train-split/train-segment', 'En-Zh/train.en-zh'),
('dev', 'test-segment/tst2010', 'En-Zh/tst2010.en-zh'),
('test', 'test-segment/tst2015', 'En-Zh/tst2015.en-zh')
]
for data_info in data_types_infos:
dtype, audio_relative_dir, text_relative_path = data_info
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
text_path = os.path.join(data_dir, text_relative_path)
audio_dir = os.path.join(data_dir, audio_relative_dir)
for line in codecs.open(text_path, 'r', 'utf-8', errors='ignore'):
line = line.strip()
if len(line) < 1:
continue
audio_id, transcription, translation = line.split('\t')
utt = audio_id.split('.')[0]
audio_path = os.path.join(audio_dir, audio_id)
if os.path.exists(audio_path):
if os.path.getsize(audio_path) < 30000:
continue
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
json_lines.append(
json.dumps(
{
'utt': utt,
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': " ".join(translation.split()),
'text1': " ".join(trancription.split())
},
ensure_ascii=False))
total_sec += duration
total_text += len(translation.split())
total_num += 1
if not total_num % 1000:
print(dtype, 'Processed:', total_num)
manifest_path = manifest_path_prefix + '.' + dtype + '.raw'
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
def prepare_dataset(src_dir, manifest_path=None):
"""create manifest file."""
if os.path.isdir(manifest_path):
manifest_path = os.path.join(manifest_path, 'manifest')
if manifest_path:
create_manifest(src_dir, manifest_path)
def main():
if args.src_dir.startswith('~'):
args.src_dir = os.path.expanduser(args.src_dir)
prepare_dataset(src_dir=args.src_dir, manifest_path=args.manifest_prefix)
print("manifest prepare done!")
if __name__ == '__main__':
main()
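With those fields, a single manifest line written by this script would look roughly like the following (utterance id, path, duration and sentences are made up):

{"utt": "ted_0001_0", "feat": "/data/TED-En-Zh/test-segment/tst2010/ted_0001_0.wav", "feat_shape": [7.26], "text": "今天 天气 很 好", "text1": "the weather is nice today"}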
# ASR
* s0 is for deepspeech2 offline
* s1 is for transformer/conformer/U2
* s2 is for transformer/conformer/U2 w/ kaldi feat (Kaldi needs to be installed)
...@@ -32,7 +32,7 @@ collator:
   keep_transcription_text: False
   sortagrad: True
   shuffle_method: batch_shuffle
-  num_workers: 0
+  num_workers: 2

 model:
   num_conv_layers: 2
......
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
......
 # LibriSpeech

+## Data
+| Data Subset | Duration in Seconds |
+| --- | --- |
+| data/manifest.train | 0.83s ~ 29.735s |
+| data/manifest.dev | 1.065s ~ 35.155s |
+| data/manifest.test-clean | 1.285s ~ 34.955s |
+
 ## Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-all | attention | 6.35 | 0.057117 |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | 6.35 | 0.030162 |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 6.35 | 0.037910 |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 6.35 | 0.037761 |
 | conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 6.35 | 0.032115 |
+
+### Test w/o length filter
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | 6.35 | 0.057117 |

 ## Chunk Conformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.01250648 | 0.069548 |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.01250648 | 0.094753 |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.01250648 | - |
-| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.01250648 | - |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | 7.11 | 0.063193 |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | 7.11 | 0.082394 |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | 7.11 | 0.082156 |
+| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | 7.11 | 0.071000 |

 ## Transformer
 | Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- | --- |
-| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-all | attention | 6.98 | 0.066500 |
 | transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | 6.98 | 0.036 |
+
+### Test w/o length filter
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | 6.98 | 0.066500 |
...@@ -16,7 +16,7 @@ collator:
   spm_model_prefix: 'data/bpe_unigram_5000'
   mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
-  batch_size: 16
+  batch_size: 32
   raw_wav: True  # use raw_wav or kaldi feature
   specgram_type: fbank #linear, mfcc, fbank
   feat_dim: 80
...@@ -78,7 +78,7 @@ model:
 training:
   n_epoch: 120
-  accum_grad: 8
+  accum_grad: 4
   global_grad_clip: 3.0
   optim: adam
   optim_conf:
......
-export MAIN_ROOT=${PWD}/../../../
-export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
 export LC_ALL=C
 # Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
 export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
......
...@@ -19,7 +19,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=4,5,6,7 ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...@@ -29,7 +29,7 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
-    CUDA_VISIBLE_DEVICES=7 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
+    CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi

 if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
......
../../../utils
\ No newline at end of file
# LibriSpeech
## Data
| Data Subset | Duration in Seconds |
| --- | --- |
| data/manifest.train | 0.83s ~ 29.735s |
| data/manifest.dev | 1.065s ~ 35.155s |
| data/manifest.test-clean | 1.285s ~ 34.955s |
## Conformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention | - | - |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | | |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | | |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | | |
### Test w/o length filter
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.63 M | conf/conformer.yaml | spec_aug + shift | test-clean-all | attention | | |
## Chunk Conformer
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention | 16, -1 | | |
| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_greedy_search | 16, -1 | | |
| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | ctc_prefix_beam_search | 16, -1 | | - |
| conformer | 47.63 M | conf/chunk_conformer.yaml | spec_aug + shift | test-clean | attention_rescoring | 16, -1 | | - |
## Transformer
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean | attention | | |
### Test w/o length filter
| Model | Params | Config | Augmentation| Test set | Decode method | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- | --- |
| transformer | 32.52 M | conf/transformer.yaml | spec_aug + shift | test-clean-all | attention | | |
[
{
"type": "shift",
"params": {
"min_shift_ms": -5,
"max_shift_ms": 5
},
"prob": 1.0
},
{
"type": "speed",
"params": {
"min_speed_rate": 0.9,
"max_speed_rate": 1.1,
"num_rates": 3
},
"prob": 0.0
},
{
"type": "specaug",
"params": {
"F": 10,
"T": 50,
"n_freq_masks": 2,
"n_time_masks": 2,
"p": 1.0,
"W": 80,
"adaptive_number_ratio": 0,
"adaptive_size_ratio": 0,
"max_n_time_masks": 20
},
"prob": 1.0
}
]
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5
max_input_len: 20.0
min_output_len: 0.0
max_output_len: 400.0
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
causal: True
use_dynamic_chunk: true
cnn_module_norm: 'layer_norm' # using nn.LayerNorm makes model converge faster
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 240
accum_grad: 8
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 128
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.5 # second
max_input_len: 20.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
use_dynamic_chunk: true
use_dynamic_left_chunk: false
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 1
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.001
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: true # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean
min_input_len: 0.5 # seconds
max_input_len: 20.0 # seconds
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 16
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: conformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: True
use_cnn_module: True
cnn_module_kernel: 15
activation_type: 'swish'
pos_enc_layer_type: 'rel_pos'
selfattention_layer_type: 'rel_selfattn'
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 8
global_grad_clip: 3.0
optim: adam
optim_conf:
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test-clean
min_input_len: 0.5 # second
max_input_len: 20.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.05
max_output_input_ratio: 10.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: 'data/bpe_unigram_5000'
mean_std_filepath: ""
augmentation_config: conf/augmentation.json
batch_size: 64
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 and conv2d8
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr # pytorch v1.1.0+ required
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 64
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
# alignment results are dumped into `result_file`;
# .tier and .TextGrid files are written to the directory of `result_file`
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${output_dir}/ctc.align \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in ctc alignment!"
exit 1
fi
exit 0
#!/bin/bash
stage=-1
stop_stage=100
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/librispeech/librispeech.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/librispeech" \
--full_download="True"
if [ $? -ne 0 ]; then
echo "Prepare LibriSpeech failed. Terminated."
exit 1
fi
for set in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
mv data/manifest.${set} data/manifest.${set}.raw
done
rm -rf data/manifest.train.raw data/manifest.dev.raw data/manifest.test.raw
for set in train-clean-100 train-clean-360 train-other-500; do
cat data/manifest.${set}.raw >> data/manifest.train.raw
done
for set in dev-clean dev-other; do
cat data/manifest.${set}.raw >> data/manifest.dev.raw
done
for set in test-clean test-other; do
cat data/manifest.${set}.raw >> data/manifest.test.raw
done
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type "spm" \
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test dev-clean dev-other test-clean test-other; do
{
python3 ${MAIN_ROOT}/utils/format_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
}&
done
wait
fi
echo "LibriSpeech Data preparation done."
exit 0
#!/bin/bash
. ${MAIN_ROOT}/utils/utility.sh
DIR=data/lm
mkdir -p ${DIR}
URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm
MD5="099a601759d467cd0a8523ff939819c5"
TARGET=${DIR}/common_crawl_00.prune01111.trie.klm
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
echo "Fail to download the language model!"
exit 1
fi
exit 0
#!/bin/bash
if [ $# != 3 ];then
echo "usage: $0 config_path ckpt_prefix jit_model_path"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
--export_path ${jit_model_export_path}
if [ $? -ne 0 ]; then
echo "Failed in export!"
exit 1
fi
exit 0
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
chunk_mode=true
fi
echo "chunk mode ${chunk_mode}"
# download language model
#bash local/download_lm_en.sh
#if [ $? -ne 0 ]; then
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only support batchsize=1
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name}
if [ $? -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
#!/bin/bash
set -e
source path.sh
stage=0
stop_stage=100
conf_path=conf/transformer.yaml
avg_num=30
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# ctc alignment of test data
CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
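# Usage sketch (hedged; the stage numbers below are illustrative, the defaults are defined above):
#   bash run.sh                               # run every stage: data prep, train, avg, test, align, export
#   bash run.sh --stage 3 --stop_stage 4      # re-run only testing and CTC alignment on the averaged checkpoint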
../../../utils/
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
-export MAIN_ROOT=${PWD}/../../
+export MAIN_ROOT=`realpath ${PWD}/../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
# TED En -> Zh
* t0: speech translation recipe based on the U2 model (usage sketch below)
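A minimal usage sketch (the example directory and stage range are assumptions; the flags mirror the defaults in this example's `run.sh`):

```bash
cd examples/ted_en_zh/t0        # assumed location of this recipe inside the repo
# data preparation, training, checkpoint averaging and testing in one pass
bash run.sh --stage 0 --stop_stage 3 \
    --conf_path conf/transformer_joint_noam.yaml \
    --data_path ./TED-En-Zh \
    --avg_num 5
```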
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train.tiny
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.05 # second
max_input_len: 30.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.01
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
asr_weight: 0.0
ctc_weight: 0.0
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 0.004
weight_decay: 1e-06
scheduler: warmuplr # warm-up learning rate scheduler
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 5
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
# https://yaml.org/type/float.html
data:
train_manifest: data/manifest.train
dev_manifest: data/manifest.dev
test_manifest: data/manifest.test
min_input_len: 0.05 # second
max_input_len: 30.0 # second
min_output_len: 0.0 # tokens
max_output_len: 400.0 # tokens
min_output_input_ratio: 0.01
max_output_input_ratio: 20.0
collator:
vocab_filepath: data/vocab.txt
unit_type: 'spm'
spm_model_prefix: data/bpe_unigram_8000
mean_std_filepath: ""
# augmentation_config: conf/augmentation.json
batch_size: 10
raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank
feat_dim: 80
delta_delta: False
dither: 1.0
target_sample_rate: 16000
max_freq: None
n_fft: None
stride_ms: 10.0
window_ms: 25.0
use_dB_normalization: True
target_dB: -20
random_seed: 0
keep_transcription_text: False
sortagrad: True
shuffle_method: batch_shuffle
num_workers: 2
# network architecture
model:
cmvn_file: "data/mean_std.json"
cmvn_file_type: "json"
# encoder related
encoder: transformer
encoder_conf:
output_size: 256 # dimension of attention
attention_heads: 4
linear_units: 2048 # the number of units of position-wise feed forward
num_blocks: 12 # the number of encoder blocks
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.0
input_layer: conv2d # encoder input type, you can choose conv2d, conv2d6 or conv2d8
normalize_before: true
# decoder related
decoder: transformer
decoder_conf:
attention_heads: 4
linear_units: 2048
num_blocks: 6
dropout_rate: 0.1
positional_dropout_rate: 0.1
self_attention_dropout_rate: 0.0
src_attention_dropout_rate: 0.0
# hybrid CTC/attention
model_conf:
asr_weight: 0.5
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
training:
n_epoch: 120
accum_grad: 2
global_grad_clip: 5.0
optim: adam
optim_conf:
lr: 2.5
weight_decay: 1e-06
scheduler: noam
scheduler_conf:
warmup_steps: 25000
lr_decay: 1.0
log_interval: 5
checkpoint:
kbest_n: 50
latest_n: 5
decoding:
batch_size: 5
error_rate_type: char-bleu
decoding_method: fullsentence # 'fullsentence', 'simultaneous'
alpha: 2.5
beta: 0.3
beam_size: 10
cutoff_prob: 1.0
cutoff_top_n: 0
num_proc_bsearch: 8
ctc_weight: 0.5 # ctc weight for attention rescoring decode mode.
decoding_chunk_size: -1 # decoding chunk size. Defaults to -1.
# <0: for decoding, use full chunk.
# >0: for decoding, use fixed chunk size as set.
# 0: used for training, it's prohibited here.
num_decoding_left_chunks: -1 # number of left chunks for decoding. Defaults to -1.
simulate_streaming: False # simulate streaming inference. Defaults to False.
#!/bin/bash
stage=-1
stop_stage=100
# bpemode (unigram or bpe)
nbpe=8000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
DATA_DIR=
source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
if [ ! -d "${DATA_DIR}" ]; then
echo "Error: Dataset is not available. Please download and unzip the dataset"
echo "Download Link: https://pan.baidu.com/s/18L-59wgeS96WkObISrytQQ Passwd: bva0"
echo "The tree of the directory should be:"
echo "."
echo "|-- En-Zh"
echo "|-- test-segment"
echo " |-- tst2010"
echo " |-- ..."
echo "|-- train-split"
echo " |-- train-segment"
echo "|-- README.md"
exit 1
fi
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# generate manifests
python3 ${TARGET_DIR}/ted_en_zh/ted_en_zh.py \
--manifest_prefix="data/manifest" \
--src_dir="${DATA_DIR}"
echo "Complete raw data pre-process."
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build vocabulary
python3 ${MAIN_ROOT}/utils/build_vocab.py \
--unit_type "spm" \
--spm_vocab_size=${nbpe} \
--spm_mode ${bpemode} \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--text_keys 'text' 'text1' \
--manifest_paths="data/manifest.train.raw"
if [ $? -ne 0 ]; then
echo "Build vocabulary failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# compute mean and stddev for normalizer
num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=-1 \
--specgram_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--sample_rate=16000 \
--stride_ms=10.0 \
--window_ms=25.0 \
--use_dB_normalization=False \
--num_workers=${num_workers} \
--output_path="data/mean_std.json"
if [ $? -ne 0 ]; then
echo "Compute mean and stddev failed. Terminated."
exit 1
fi
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# format manifest with tokenids, vocab size
for set in train dev test; do
{
python3 ${MAIN_ROOT}/utils/format_triplet_data.py \
--feat_type "raw" \
--cmvn_path "data/mean_std.json" \
--unit_type "spm" \
--spm_model_prefix ${bpeprefix} \
--vocab_path="data/vocab.txt" \
--manifest_path="data/manifest.${set}.raw" \
--output_path="data/manifest.${set}"
if [ $? -ne 0 ]; then
echo "Formt mnaifest failed. Terminated."
exit 1
fi
}&
done
wait
fi
echo "Ted En-Zh Data preparation done."
exit 0
#! /usr/bin/env bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
for type in fullsentence; do
echo "decoding ${type}"
batch_size=32
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
exit 0
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name}
if [ $? -ne 0 ]; then
echo "Failed in training!"
exit 1
fi
exit 0
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=u2_st
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
#!/bin/bash
set -e
source path.sh
stage=0
stop_stage=100
conf_path=conf/transformer_joint_noam.yaml
avg_num=5
data_path=./TED-En-Zh # path to unzipped data
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
echo "checkpoint name ${ckpt}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh --DATA_DIR ${data_path} || exit -1
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
../../utils/avg.sh exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
fi
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
# export ckpt avg_n
CUDA_VISIBLE_DEVICES= ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
fi
-export MAIN_ROOT=${PWD}/../../
+export MAIN_ROOT=`realpath ${PWD}/../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
@@ -32,7 +32,7 @@ collator:
 keep_transcription_text: False
 sortagrad: True
 shuffle_method: batch_shuffle
-num_workers: 0
+num_workers: 2
 batch_size: 4

 model:
...
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
@@ -3,18 +3,20 @@ data:
 train_manifest: data/manifest.tiny
 dev_manifest: data/manifest.tiny
 test_manifest: data/manifest.tiny
+min_input_len: 0.5 # second
+max_input_len: 20.0 # second
+min_output_len: 0.0 # tokens
+max_output_len: 400.0 # tokens
+min_output_input_ratio: 0.05
+max_output_input_ratio: 10.0
+
+collator:
+mean_std_filepath: ""
 vocab_filepath: data/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/bpe_unigram_200'
-mean_std_filepath: ""
 augmentation_config: conf/augmentation.json
 batch_size: 4
-min_input_len: 0.5
-max_input_len: 20.0
-min_output_len: 0.0
-max_output_len: 400.0
-min_output_input_ratio: 0.05
-max_output_input_ratio: 10.0
 raw_wav: True # use raw_wav or kaldi feature
 specgram_type: fbank #linear, mfcc, fbank
 feat_dim: 80
...
@@ -3,18 +3,20 @@ data:
 train_manifest: data/manifest.tiny
 dev_manifest: data/manifest.tiny
 test_manifest: data/manifest.tiny
-vocab_filepath: data/vocab.txt
-unit_type: 'spm'
-spm_model_prefix: 'data/bpe_unigram_200'
-mean_std_filepath: ""
-augmentation_config: conf/augmentation.json
-batch_size: 4
 min_input_len: 0.5 # second
 max_input_len: 20.0 # second
 min_output_len: 0.0 # tokens
 max_output_len: 400.0 # tokens
 min_output_input_ratio: 0.05
 max_output_input_ratio: 10.0
+
+collator:
+mean_std_filepath: ""
+vocab_filepath: data/vocab.txt
+unit_type: 'spm'
+spm_model_prefix: 'data/bpe_unigram_200'
+augmentation_config: conf/augmentation.json
+batch_size: 4
 raw_wav: True # use raw_wav or kaldi feature
 specgram_type: fbank #linear, mfcc, fbank
 feat_dim: 80
...
@@ -3,18 +3,20 @@ data:
 train_manifest: data/manifest.tiny
 dev_manifest: data/manifest.tiny
 test_manifest: data/manifest.tiny
+min_input_len: 0.5 # second
+max_input_len: 20.0 # second
+min_output_len: 0.0 # tokens
+max_output_len: 400.0 # tokens
+min_output_input_ratio: 0.05
+max_output_input_ratio: 10.0
+
+collator:
+mean_std_filepath: ""
 vocab_filepath: data/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/bpe_unigram_200'
-mean_std_filepath: ""
 augmentation_config: conf/augmentation.json
 batch_size: 4
-min_input_len: 0.5
-max_input_len: 20.0
-min_output_len: 0.0
-max_output_len: 400.0
-min_output_input_ratio: 0.05
-max_output_input_ratio: 10.0
 raw_wav: True # use raw_wav or kaldi feature
 specgram_type: fbank #linear, mfcc, fbank
 feat_dim: 80
...
@@ -11,30 +11,29 @@ data:
 max_output_input_ratio: 10.0

 collator:
-vocab_filepath: data/vocab.txt
 mean_std_filepath: ""
-augmentation_config: conf/augmentation.json
-random_seed: 0
+vocab_filepath: data/vocab.txt
 unit_type: 'spm'
 spm_model_prefix: 'data/bpe_unigram_200'
-specgram_type: fbank
+augmentation_config: conf/augmentation.json
+batch_size: 4
+raw_wav: True # use raw_wav or kaldi feature
+specgram_type: fbank #linear, mfcc, fbank
 feat_dim: 80
 delta_delta: False
-stride_ms: 10.0
-window_ms: 20.0
-n_fft: None
-max_freq: None
+dither: 1.0
 target_sample_rate: 16000
+max_freq: None
+n_fft: None
+stride_ms: 10.0
+window_ms: 25.0
 use_dB_normalization: True
 target_dB: -20
-dither: 1.0
+random_seed: 0
 keep_transcription_text: False
-batch_size: 4
 sortagrad: True
 shuffle_method: batch_shuffle
-num_workers: 0 #2
-raw_wav: True # use raw_wav or kaldi feature
+num_workers: 2

 # network architecture
 model:
...
-export MAIN_ROOT=${PWD}/../../../
+export MAIN_ROOT=`realpath ${PWD}/../../../`
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 export LC_ALL=C
...
@@ -9,14 +9,21 @@ if [ $(id -u) -eq 0 ]; then
 fi

 if [ -e /etc/lsb-release ];then
-#${SUDO} apt-get update
-${SUDO} apt-get install -y vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
+${SUDO} apt-get update -y
+${SUDO} apt-get install -y jq vim tig tree sox pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev
 if [ $? != 0 ]; then
 error_msg "Please using Ubuntu or install pkg-config libflac-dev libogg-dev libvorbis-dev libboost-dev swig python3-dev by user."
 exit -1
 fi
 fi

+# tools/make
+rm tools/*.done
+pushd tools && make && popd
+source tools/venv/bin/activate
+
 # install python dependencies
 if [ -f "requirements.txt" ]; then
 pip3 install -r requirements.txt
...
@@ -54,10 +61,10 @@ if [ $? != 0 ]; then
 fi
 cd AutoLog
 pip install -r requirements.txt
 python setup.py install
 cd ..
 rm -rf AutoLog
 fi

 # install decoders
 python3 -c "import pkg_resources; pkg_resources.require(\"swig_decoders==1.1\")"
...
@@ -44,6 +44,11 @@ add_arg('manifest_paths', str,
 "You can provide multiple manifest files.",
 nargs='+',
 required=True)
+add_arg('text_keys', str,
+'text',
+"keys of the text in manifest for building vocabulary. "
+"You can provide multiple keys.",
+nargs='+')
 # bpe
 add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
 add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
@@ -58,10 +63,10 @@ def count_manifest(counter, text_feature, manifest_path):
 line = text_feature.tokenize(line_json['text'])
 counter.update(line)

-def dump_text_manifest(fileobj, manifest_path):
+def dump_text_manifest(fileobj, manifest_path, key='text'):
 manifest_jsons = read_manifest(manifest_path)
 for line_json in manifest_jsons:
-fileobj.write(line_json['text'] + "\n")
+fileobj.write(line_json[key] + "\n")

 def main():
 print_arguments(args, globals())
@@ -78,7 +83,9 @@ def main():
 fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
 for manifest_path in args.manifest_paths:
-dump_text_manifest(fp, manifest_path)
+text_keys = [args.text_keys] if type(args.text_keys) is not list else args.text_keys
+for text_key in text_keys:
+dump_text_manifest(fp, manifest_path, key=text_key)
 fp.close()

 # train
 spm.SentencePieceTrainer.Train(
...
#!/bin/bash
if [ $# != 1 ];then
echo "usage: ${0} manifest_file"
exit -1
fi
manifest=$1
jq -S '.feat_shape[0]' ${manifest} | sort -nu
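# Illustrative input/output (hedged: the values below are made up; the structure follows
# the JSON-lines manifests produced by the data preparation scripts above):
#   each manifest line is one utterance, e.g.
#     {"feat": "wav/001.wav", "feat_shape": [2.385, 80], "text": "..."}
#   and the jq pipeline prints the sorted, de-duplicated durations (feat_shape[0]):
#     2.385
#     3.17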
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from deepspeech.frontend.utility import load_cmvn
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.utility import add_arguments
from deepspeech.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kaldi")
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
# yapf: enable
args = parser.parse_args()
def main():
print_arguments(args, globals())
fout = open(args.output_path, 'w', encoding='utf-8')
# get feat dim
mean, std = load_cmvn(args.cmvn_path, filetype='json')
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
count = 0
for manifest_path in args.manifest_paths:
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
# text: translation text, text1: transcript text.
# Currently only a joint vocabulary is supported; separate vocabularies will be added later.
line = line_json['text']
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
line_json['token'] = tokens
line_json['token_id'] = tokenids
line_json['token_shape'] = (len(tokenids), vocab_size)
line = line_json['text1']
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
line_json['token1'] = tokens
line_json['token_id1'] = tokenids
line_json['token_shape1'] = (len(tokenids), vocab_size)
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
if args.feat_type == 'raw':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
fout.write(json.dumps(line_json) + '\n')
count += 1
print(f"Examples number: {count}")
fout.close()
if __name__ == '__main__':
main()
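# Illustrative record (hedged: values are made up, field names follow the code above).
# An input manifest line such as
#   {"feat": "x.wav", "feat_shape": [2.1], "text": "中文翻译", "text1": "english transcript"}
# is rewritten with tokenization metadata for both texts and the feature dim appended:
#   {"feat": "x.wav", "feat_shape": [2.1, 80],
#    "text": "...", "token": [...], "token_id": [...], "token_shape": [len(token_id), vocab_size],
#    "text1": "...", "token1": [...], "token_id1": [...], "token_shape1": [len(token_id1), vocab_size], ...}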