Commit 285e0c9c authored by: H huangyuxin

merge the change

......@@ -18,5 +18,7 @@ tools/sox-14.4.2
tools/soxbindings
tools/montreal-forced-aligner/
tools/Montreal-Forced-Aligner/
tools/sctk
tools/sctk-20159b5/
*output/
......@@ -27,7 +27,7 @@ def main_sp(config, args):
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
......
......@@ -19,6 +19,7 @@ from contextlib import nullcontext
from pathlib import Path
from typing import Optional
import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
......@@ -305,9 +306,10 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
len_refs += len_ref
num_ins += 1
if fout:
fout.write(utt + " " + result + "\n")
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("Current error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result)))
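For reference, a minimal sketch of what the switch to jsonlines produces: each fout.write(dict) call emits one JSON object per line. The file name and field values below are illustrative only.

import jsonlines

# illustrative output path and values, not taken from the diff
with jsonlines.open("demo.rsl", mode='w') as fout:
    fout.write({"utt": "utt_001", "ref": "hello world", "hyp": "hello word"})

# demo.rsl now contains a single line:
# {"utt": "utt_001", "ref": "hello world", "hyp": "hello word"}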
......@@ -350,7 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
cfg = self.config
error_rate_type = None
errors_sum, len_refs, num_ins = 0.0, 0, 0
with open(self.args.result_file, 'w') as fout:
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
utts, audio, audio_len, texts, texts_len = batch
metrics = self.compute_metrics(utts, audio, audio_len, texts,
......@@ -403,7 +405,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir()
self.setup_checkpointer()
......@@ -635,7 +637,7 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir()
......
......@@ -32,7 +32,7 @@ def main_sp(config, args):
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
......
......@@ -22,6 +22,7 @@ from contextlib import nullcontext
from pathlib import Path
from typing import Optional
import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
......@@ -466,9 +467,10 @@ class U2Tester(U2Trainer):
len_refs += len_ref
num_ins += 1
if fout:
fout.write(utt + " " + result + "\n")
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result)))
......@@ -493,7 +495,7 @@ class U2Tester(U2Trainer):
errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0
num_time = 0.0
with open(self.args.result_file, 'w') as fout:
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames']
......@@ -653,7 +655,7 @@ class U2Tester(U2Trainer):
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir()
self.setup_checkpointer()
......
......@@ -36,7 +36,7 @@ def main_sp(config, args):
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
......
......@@ -21,6 +21,7 @@ from contextlib import nullcontext
from pathlib import Path
from typing import Optional
import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
......@@ -445,9 +446,10 @@ class U2Tester(U2Trainer):
len_refs += len_ref
num_ins += 1
if fout:
fout.write(utt + " " + result + "\n")
logger.info("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example error rate [%s] = %f" %
(cfg.error_rate_type, error_rate_func(target, result)))
......@@ -472,7 +474,7 @@ class U2Tester(U2Trainer):
errors_sum, len_refs, num_ins = 0.0, 0, 0
num_frames = 0.0
num_time = 0.0
with open(self.args.result_file, 'w') as fout:
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
metrics = self.compute_metrics(*batch, fout=fout)
num_frames += metrics['num_frames']
......@@ -637,7 +639,7 @@ class U2Tester(U2Trainer):
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir()
self.setup_checkpointer()
......
......@@ -30,7 +30,7 @@ def main_sp(config, args):
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
if args.nprocs > 0:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
......
......@@ -21,6 +21,7 @@ from contextlib import nullcontext
from pathlib import Path
from typing import Optional
import jsonlines
import numpy as np
import paddle
from paddle import distributed as dist
......@@ -479,8 +480,10 @@ class U2STTester(U2STTrainer):
len_refs += len(target.split())
num_ins += 1
if fout:
fout.write(utt + " " + result + "\n")
logger.info("\nReference: %s\nHypothesis: %s" % (target, result))
fout.write({"utt": utt, "ref": target, "hyp": result})
logger.info(f"Utt: {utt}")
logger.info(f"Ref: {target}")
logger.info(f"Hyp: {result}")
logger.info("One example BLEU = %s" %
(bleu_func([result], [[target]]).prec_str))
......@@ -508,7 +511,7 @@ class U2STTester(U2STTrainer):
len_refs, num_ins = 0, 0
num_frames = 0.0
num_time = 0.0
with open(self.args.result_file, 'w') as fout:
with jsonlines.open(self.args.result_file, 'w') as fout:
for i, batch in enumerate(self.test_loader):
metrics = self.compute_translation_metrics(
*batch, bleu_func=bleu_func, fout=fout)
......@@ -661,7 +664,7 @@ class U2STTester(U2STTrainer):
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir()
self.setup_checkpointer()
......
......@@ -12,13 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains data helper functions."""
import codecs
import json
import math
from typing import List
from typing import Optional
from typing import Text
import jsonlines
import numpy as np
from deepspeech.utils.log import Log
......@@ -92,26 +92,22 @@ def read_manifest(
"""
manifest = []
for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
try:
json_data = json.loads(json_line)
except Exception as e:
raise IOError("Error reading manifest: %s" % str(e))
feat_len = json_data["feat_shape"][
0] if 'feat_shape' in json_data else 1.0
token_len = json_data["token_shape"][
0] if 'token_shape' in json_data else 1.0
conditions = [
feat_len >= min_input_len,
feat_len <= max_input_len,
token_len >= min_output_len,
token_len <= max_output_len,
token_len / feat_len >= min_output_input_ratio,
token_len / feat_len <= max_output_input_ratio,
]
if all(conditions):
manifest.append(json_data)
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
feat_len = json_data["feat_shape"][
0] if 'feat_shape' in json_data else 1.0
token_len = json_data["token_shape"][
0] if 'token_shape' in json_data else 1.0
conditions = [
feat_len >= min_input_len,
feat_len <= max_input_len,
token_len >= min_output_len,
token_len <= max_output_len,
token_len / feat_len >= min_output_input_ratio,
token_len / feat_len <= max_output_input_ratio,
]
if all(conditions):
manifest.append(json_data)
return manifest
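To illustrate the manifest format the jsonlines-based reader consumes, here is a minimal sketch with hypothetical entries; only the feat_shape and token_shape keys are used by the filtering code above, the other keys and values are assumptions.

import jsonlines

# write a tiny two-entry manifest, one JSON object per line (values are made up)
entries = [
    {"utt": "utt_001", "feat_shape": [320, 80], "token_shape": [12]},
    {"utt": "utt_002", "feat_shape": [5000, 80], "token_shape": [1]},
]
with jsonlines.open("manifest.tiny", mode='w') as writer:
    writer.write_all(entries)

# read it back the same way read_manifest does and apply a simple length filter
with jsonlines.open("manifest.tiny", mode='r') as reader:
    kept = [d for d in reader if d["feat_shape"][0] <= 1000]
print(len(kept))  # 1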
......
......@@ -14,6 +14,20 @@
import argparse
class ExtendAction(argparse.Action):
"""
Backport of argparse's "extend" action. Since Python 3.8, "extend" is available
directly in the stdlib (https://docs.python.org/3.8/library/argparse.html#action),
so if you only have to support 3.8+, defining it yourself is no longer required;
the stdlib action is used in exactly the same way as this class.
"""
def __call__(self, parser, namespace, values, option_string=None):
items = getattr(namespace, self.dest) or []
items.extend(values)
setattr(namespace, self.dest, items)
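A minimal usage sketch of the backported action; the flag name mirrors the --opts definition further down, and the command line being parsed is illustrative.

import argparse

parser = argparse.ArgumentParser()
parser.register('action', 'extend', ExtendAction)
parser.add_argument('--opts', action='extend', nargs=2, metavar=('key', 'val'))

# every --opts occurrence appends its KEY VALUE pair to one flat list
args = parser.parse_args(
    '--opts decoding.batch_size 1 --opts decoding.decoding_method attention'.split())
print(args.opts)
# ['decoding.batch_size', '1', 'decoding.decoding_method', 'attention']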
def default_argument_parser():
r"""A simple yet genral argument parser for experiments with parakeet.
......@@ -30,7 +44,7 @@ def default_argument_parser():
The ``--checkpoint_path`` specifies the checkpoint to load from.
The ``--device`` and ``--nprocs`` specifies how to run the training.
The ``--nprocs`` specifies how to run the training.
See Also
......@@ -42,6 +56,7 @@ def default_argument_parser():
the parser
"""
parser = argparse.ArgumentParser()
parser.register('action', 'extend', ExtendAction)
train_group = parser.add_argument_group(
title='Train Options', description=None)
......@@ -51,12 +66,6 @@ def default_argument_parser():
default=None,
help="seed to use for paddle, np and random. None or 0 for random, else set seed."
)
train_group.add_argument(
"--device",
type=str,
default='gpu',
choices=["cpu", "gpu"],
help="device cpu and gpu are supported.")
train_group.add_argument(
"--nprocs",
type=int,
......@@ -70,10 +79,10 @@ def default_argument_parser():
"--checkpoint_path", type=str, help="path to load checkpoint")
train_group.add_argument(
"--opts",
type=str,
default=[],
nargs='+',
help="overwrite --config file, passing in LIST[KEY VALUE] pairs")
action='extend',
nargs=2,
metavar=('key', 'val'),
help="overwrite --config field, passing (KEY VALUE) pairs")
train_group.add_argument(
"--dump-config", metavar="FILE", help="dump config to `this` file.")
......
......@@ -86,7 +86,7 @@ class Trainer():
>>> config.merge_from_list(args.opts)
>>> config.freeze()
>>>
>>> if args.nprocs > 1 and args.device == "gpu":
>>> if args.nprocs > 0:
>>> dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
>>> else:
>>> main_sp(config, args)
......@@ -119,7 +119,7 @@ class Trainer():
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
if self.parallel:
self.init_parallel()
......@@ -139,7 +139,7 @@ class Trainer():
"""A flag indicating whether the experiment should run with
multiprocessing.
"""
return self.args.device == "gpu" and self.args.nprocs > 1
return self.args.nprocs > 1
def init_parallel(self):
"""Init environment for multiprocess training.
......
......@@ -94,9 +94,19 @@ def pad_sequence(sequences: List[paddle.Tensor],
length = tensor.shape[0]
# use index notation to prevent duplicate references to the tensor
if batch_first:
out_tensor[i, :length, ...] = tensor
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[i, :length, ...] = tensor
if length != 0:
out_tensor[i, :length, ...] = tensor
else:
out_tensor[i, length, ...] = tensor
else:
out_tensor[:length, i, ...] = tensor
# TODO (Hui Zhang): set_value op does not support `end==start`
# out_tensor[:length, i, ...] = tensor
if length != 0:
out_tensor[:length, i, ...] = tensor
else:
out_tensor[length, i, ...] = tensor
return out_tensor
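A minimal usage sketch of the padded output these branches preserve, assuming this pad_sequence mirrors torch's interface (batch_first and zero padding) as the surrounding code suggests; the tensor values are illustrative.

import paddle

# three variable-length 1-D sequences (lengths 3, 2, 1)
seqs = [paddle.to_tensor([1., 2., 3.]),
        paddle.to_tensor([4., 5.]),
        paddle.to_tensor([6.])]

out = pad_sequence(seqs, batch_first=True)  # shorter rows are right-padded with zeros
print(out.numpy())
# [[1. 2. 3.]
#  [4. 5. 0.]
#  [6. 0. 0.]]
# the `length != 0` guards above only matter for empty sequences, where
# Paddle's set_value op cannot handle a slice with end == start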
......
# 1xt2x
Convert Deepspeech 1.8 released model to 2.x.
## Model
* Deepspeech2x
## Exp
* baidu_en8k
* aishell
* librispeech
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -401,7 +401,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def setup(self):
"""Setup the experiment.
"""
paddle.set_device(self.args.device)
paddle.set_device('gpu' if self.args.nprocs > 0 else 'cpu')
self.setup_output_dir()
self.setup_checkpointer()
......
......@@ -13,13 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
model_type=$4
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
jit_model_export_path=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test_export.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \
......
......@@ -12,11 +12,6 @@ config_path=$1
ckpt_name=$2
model_type=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
mkdir -p exp
# seed may break model convergence
......@@ -26,7 +21,6 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -22,8 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,11 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -39,8 +34,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......@@ -58,8 +52,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,11 +12,6 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
......@@ -34,7 +29,6 @@ mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -20,7 +16,6 @@ ckpt_name=$(basename ${ckpt_prefix})
mkdir -p exp
batch_size=1
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
......@@ -28,8 +23,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -32,8 +28,7 @@ for type in attention ctc_greedy_search; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......@@ -51,8 +46,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
output_dir=${ckpt_prefix}
mkdir -p ${output_dir}
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -11,10 +11,6 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
......@@ -26,7 +22,6 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -13,13 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
model_type=$4
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,12 +12,6 @@ config_path=$1
ckpt_name=$2
model_type=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
# seed may break model convergence
......@@ -27,7 +21,6 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the maximum number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <ngpu>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, configured by
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# The other usage
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the host to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# This assumes you can log in to them without a password, i.e. you have set up ssh keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -22,8 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: ${0} config_path ckpt_path_prefix"
set -e
expdir=exp
datadir=data
nj=32
lmtag=
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpemodel=${bpeprefix}.model
if [ $# != 3 ];then
echo "usage: ${0} config_path dict_path ckpt_path_prefix"
exit -1
fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
dict=$2
ckpt_prefix=$3
chunk_mode=false
if [[ ${config_path} =~ ^.*chunk_.*yaml$ ]];then
......@@ -29,44 +42,46 @@ echo "chunk mode ${chunk_mode}"
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only support batchsize=1
pids=() # initialize pids
for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
(
for rtask in ${recog_set}; do
(
decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
feat_recog_dir=${datadir}
mkdir -p ${expdir}/${decode_dir}
mkdir -p ${feat_recog_dir}
# split data
split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
#### use CPU for decoding
ngpu=0
# set batchsize 0 to disable batch decoding
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
python3 -u ${BIN_DIR}/test.py \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${expdir}/${decode_dir}/data.JOB.json \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${dmethd} \
--opts decoding.batch_size ${batch_size} \
--opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer true ${expdir}/${decode_dir} ${dict}
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
) &
pids+=($!) # store background pids
done
) &
pids+=($!) # store background pids
done
i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
[ ${i} -gt 0 ] && echo "$0: ${i} background jobs failed." && false
echo "Finished"
exit 0
......@@ -11,12 +11,6 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
# seed may break model convergence
......@@ -25,8 +19,10 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
# export FLAGS_cudnn_exhaustive_search=true
# export FLAGS_conv_workspace_size_limit=4000
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
......
#!/bin/bash
set -e
source path.sh
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
stage=0
stop_stage=100
conf_path=conf/transformer.yaml
avg_num=5
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
. ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
ckpt=$(basename ${conf_path} | awk -F'.' '{print $1}')
......
# ====== About run.pl, queue.pl, slurm.pl, and ssh.pl ======
# Usage: <cmd>.pl [options] JOB=1:<nj> <log> <command...>
# e.g.
# run.pl --mem 4G JOB=1:10 echo.JOB.log echo JOB
#
# Options:
# --time <time>: Limit the maximum time to execute.
# --mem <mem>: Limit the maximum memory usage.
# --max-jobs-run <njob>: Limit the maximum number of parallel jobs. This is ignored for non-array jobs.
# --num-threads <ngpu>: Specify the number of CPU cores.
# --gpu <ngpu>: Specify the number of GPU devices.
# --config: Change the configuration file from default.
#
# "JOB=1:10" is used for "array jobs" and it can control the number of parallel jobs.
# The left string of "=", i.e. "JOB", is replaced by <N>(Nth job) in the command and the log file name,
# e.g. "echo JOB" is changed to "echo 3" for the 3rd job and "echo 8" for 8th job respectively.
# Note that the number must start with a positive number, so you can't use "JOB=0:10" for example.
#
# run.pl, queue.pl, slurm.pl, and ssh.pl share a unified interface that does not depend on the backend.
# These options are mapped to backend-specific options, configured by
# "conf/queue.conf" and "conf/slurm.conf" by default.
# If jobs fail, your configuration might be wrong for your environment.
#
#
# The official documentation for run.pl, queue.pl, slurm.pl, and ssh.pl:
# "Parallelization in Kaldi": http://kaldi-asr.org/doc/queue.html
# =========================================================
# Select the backend used by run.sh from "local", "sge", "slurm", or "ssh"
cmd_backend='local'
# Local machine, without any Job scheduling system
if [ "${cmd_backend}" = local ]; then
# The other usage
export train_cmd="run.pl"
# Used for "*_train.py": "--gpu" is appended optionally by run.sh
export cuda_cmd="run.pl"
# Used for "*_recog.py"
export decode_cmd="run.pl"
# "qsub" (SGE, Torque, PBS, etc.)
elif [ "${cmd_backend}" = sge ]; then
# The default setting is written in conf/queue.conf.
# You must change "-q g.q" for the "queue" for your environment.
# To know the "queue" names, type "qhost -q"
# Note that to use "--gpu *", you have to setup "complex_value" for the system scheduler.
export train_cmd="queue.pl"
export cuda_cmd="queue.pl"
export decode_cmd="queue.pl"
# "sbatch" (Slurm)
elif [ "${cmd_backend}" = slurm ]; then
# The default setting is written in conf/slurm.conf.
# You must change "-p cpu" and "-p gpu" for the "partion" for your environment.
# To know the "partion" names, type "sinfo".
# You can use "--gpu * " by default for slurm and it is interpreted as "--gres gpu:*"
# The devices are allocated exclusively using "${CUDA_VISIBLE_DEVICES}".
export train_cmd="slurm.pl"
export cuda_cmd="slurm.pl"
export decode_cmd="slurm.pl"
elif [ "${cmd_backend}" = ssh ]; then
# You have to create ".queue/machines" to specify the host to execute jobs.
# e.g. .queue/machines
# host1
# host2
# host3
# This assumes you can log in to them without a password, i.e. you have set up ssh keys.
export train_cmd="ssh.pl"
export cuda_cmd="ssh.pl"
export decode_cmd="ssh.pl"
# This is an example of specifying several unique options in the JHU CLSP cluster setup.
# Users can modify/add their own command options according to their cluster environments.
elif [ "${cmd_backend}" = jhu ]; then
export train_cmd="queue.pl --mem 2G"
export cuda_cmd="queue-freegpu.pl --mem 2G --gpu 1 --config conf/gpu.conf"
export decode_cmd="queue.pl --mem 4G"
else
echo "$0: Error: Unknown cmd_backend=${cmd_backend}" 1>&2
return 1
fi
......@@ -12,7 +12,7 @@ collator:
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
batch_size: 32
batch_size: 30
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
......@@ -59,7 +59,7 @@ model:
model_conf:
ctc_weight: 0.3
ctc_dropoutrate: 0.0
ctc_grad_norm_type: instance
ctc_grad_norm_type: batch
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
......@@ -83,7 +83,7 @@ scheduler_conf:
lr_decay: 1.0
decoding:
batch_size: 64
batch_size: 1
error_rate_type: wer
decoding_method: attention # 'attention', 'ctc_greedy_search', 'ctc_prefix_beam_search', 'attention_rescoring'
lang_model_path: data/lm/common_crawl_00.prune01111.trie.klm
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
dict_path=$2
ckpt_prefix=$3
......@@ -26,8 +22,7 @@ python3 -u ${BIN_DIR}/test.py \
--model-name 'u2_kaldi' \
--run-mode 'align' \
--dict-path ${dict_path} \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result-file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,15 +12,9 @@ config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/test.py \
--model-name 'u2_kaldi' \
--run-mode 'export' \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
#!/bin/bash
set -e
expdir=exp
datadir=data
nj=32
lmtag=
recog_set="test-clean test-other dev-clean dev-other"
recog_set="test-clean"
# bpemode (unigram or bpe)
nbpe=5000
bpemode=unigram
bpeprefix="data/bpe_${bpemode}_${nbpe}"
bpemodel=${bpeprefix}.model
if [ $# != 3 ];then
echo "usage: ${0} config_path dict_path ckpt_path_prefix"
exit -1
......@@ -8,13 +25,8 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
dict_path=$2
dict=$2
ckpt_prefix=$3
chunk_mode=false
......@@ -30,50 +42,49 @@ echo "chunk mode ${chunk_mode}"
# exit 1
#fi
for type in attention ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only support batchsize=1
pids=() # initialize pids
for dmethd in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
(
for rtask in ${recog_set}; do
(
decode_dir=decode_${rtask}_${dmethd}_$(basename ${config_path%.*})_${lmtag}
feat_recog_dir=${datadir}
mkdir -p ${expdir}/${decode_dir}
mkdir -p ${feat_recog_dir}
# split data
split_json.sh ${feat_recog_dir}/manifest.${rtask} ${nj}
#### use CPU for decoding
ngpu=0
# set batchsize 0 to disable batch decoding
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--model-name u2_kaldi \
--run-mode test \
--dict-path ${dict_path} \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result-file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
done
${decode_cmd} JOB=1:${nj} ${expdir}/${decode_dir}/log/decode.JOB.log \
python3 -u ${BIN_DIR}/test.py \
--model-name u2_kaldi \
--run-mode test \
--nproc ${ngpu} \
--dict-path ${dict} \
--config ${config_path} \
--checkpoint_path ${ckpt_prefix} \
--result-file ${expdir}/${decode_dir}/data.JOB.json \
--opts decoding.decoding_method ${dmethd} \
--opts decoding.batch_size ${batch_size} \
--opts data.test_manifest ${feat_recog_dir}/split${nj}/JOB/manifest.${rtask}
score_sclite.sh --bpe ${nbpe} --bpemodel ${bpemodel} --wer true ${expdir}/${decode_dir} ${dict}
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--model-name u2_kaldi \
--run-mode test \
--dict-path ${dict_path} \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result-file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decoding.decoding_method ${type} decoding.batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
) &
pids+=($!) # store background pids
done
) &
pids+=($!) # store background pids
done
i=0; for pid in "${pids[@]}"; do wait ${pid} || ((++i)); done
[ ${i} -gt 0 ] && echo "$0: ${i} background jobs failed." && false
echo "Finished"
exit 0
......@@ -11,12 +11,6 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
# seed may break model convergence
......@@ -27,7 +21,6 @@ fi
python3 -u ${BIN_DIR}/train.py \
--model-name u2_kaldi \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
export MAIN_ROOT=`realpath ${PWD}/../../../`
export PATH=${MAIN_ROOT}:${PWD}/utils:${PATH}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/tools/sctk/bin:${PWD}/utils:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
......
#!/bin/bash
set -e
source path.sh
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
stage=0
stop_stage=100
conf_path=conf/transformer.yaml
dict_path=data/train_960_unigram5000_units.txt
avg_num=5
avg_num=10
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
avg_ckpt=avg_${avg_num}
......@@ -20,12 +22,12 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=0,1,2,3 ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 ./local/train.sh ${conf_path} ${ckpt}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh best exp/${ckpt}/checkpoints ${avg_num}
avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -19,8 +15,7 @@ for type in fullsentence; do
echo "decoding ${type}"
batch_size=32
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -11,12 +11,6 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
# seed may break model convergence
......@@ -26,7 +20,6 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -22,8 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,11 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -37,8 +32,7 @@ for type in attention ctc_greedy_search; do
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......@@ -54,8 +48,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -11,12 +11,6 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
echo "using ${device}..."
mkdir -p exp
# seed may break model convergence
......@@ -26,7 +20,6 @@ if [ ${seed} != 0 ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -13,13 +13,7 @@ ckpt_path_prefix=$2
jit_model_export_path=$3
model_type=$4
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
model_type=$3
......@@ -23,8 +19,7 @@ if [ $? -ne 0 ]; then
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -10,17 +10,11 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name model_type"
exit -1
......@@ -33,7 +27,6 @@ model_type=$3
mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -22,8 +18,7 @@ mkdir -p ${output_dir}
# align dump in `result_file`
# .tier, .TextGrid dump in `dir of result_file`
python3 -u ${BIN_DIR}/alignment.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${output_dir}/${type}.align \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,13 +12,7 @@ config_path=$1
ckpt_path_prefix=$2
jit_model_export_path=$3
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
python3 -u ${BIN_DIR}/export.py \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--checkpoint_path ${ckpt_path_prefix} \
......
......@@ -8,10 +8,6 @@ fi
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
config_path=$1
ckpt_prefix=$2
......@@ -35,8 +31,7 @@ for type in attention ctc_greedy_search; do
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......@@ -52,8 +47,7 @@ for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--device ${device} \
--nproc 1 \
--nproc ${ngpu} \
--config ${config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
......
......@@ -12,11 +12,6 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
device=gpu
if [ ${ngpu} == 0 ];then
device=cpu
fi
if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
......@@ -34,7 +29,6 @@ mkdir -p exp
python3 -u ${BIN_DIR}/train.py \
--seed ${seed} \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -13,7 +13,7 @@ null:null
null:null
##
trainer:norm_train
norm_train: ../../../deepspeech/exps/deepspeech2/bin/train.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline --device gpu
norm_train: ../../../deepspeech/exps/deepspeech2/bin/train.py --nproc 1 --config conf/deepspeech2.yaml --model_type offline
pact_train:null
fpgm_train:null
distill_train:null
......@@ -21,7 +21,7 @@ null:null
null:null
##
===========================eval_params===========================
eval: ../../../deepspeech/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --result_file tests/9.rsl --model_type offline --device gpu
eval: ../../../deepspeech/exps/deepspeech2/bin/test.py --nproc 1 --config conf/deepspeech2.yaml --result_file tests/9.rsl --model_type offline
null:null
##
===========================infer_params===========================
......
SHELL:= /bin/bash
PYTHON:= python3.7
CXX ?= g++
CC ?= gcc # used for sph2pipe
# CXX = clang++ # Uncomment these lines...
# CC = clang # ...to build with Clang.
WGET ?= wget
.PHONY: all clean
all: virtualenv kenlm.done sox.done soxbindings.done mfa.done
all: virtualenv kenlm.done sox.done soxbindings.done mfa.done sclite.done
virtualenv:
test -d venv || virtualenv -p $(PYTHON) venv
......@@ -39,3 +47,50 @@ mfa.done:
test -d montreal-forced-aligner || wget https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner/releases/download/v1.0.1/montreal-forced-aligner_linux.tar.gz
tar xvf montreal-forced-aligner_linux.tar.gz
touch mfa.done
#== SCTK ===============================================================================
# SCTK official repo does not have version tags. Here's the mapping:
# 2.4.9 = 659bc36; 2.4.10 = d914e1b; 2.4.11 = 20159b5.
SCTK_GITHASH = 20159b5
SCTK_CXFLAGS = -w -march=native
SCTK_MKENV = CFLAGS="$(CFLAGS) $(SCTK_CXFLAGS)" \
CXXFLAGS="$(CXXFLAGS) -std=c++11 $(SCTK_CXFLAGS)" \
# Keep the existing target 'sclite' to avoid breaking the users who might have
# scripted it in.
.PHONY: sclite.done sctk_cleaned sctk_made
sclite.done sctk_made: sctk/.compiled
touch sclite.done
sctk/.compiled: sctk
rm -f sctk/.compiled
$(SCTK_MKENV) $(MAKE) -C sctk config
$(SCTK_MKENV) $(MAKE) -C sctk all doc
$(MAKE) -C sctk install
touch sctk/.compiled
# The GitHub archive unpacks into SCTK-{40-character-long-hash}/
sctk: sctk-$(SCTK_GITHASH).tar.gz
tar zxvf sctk-$(SCTK_GITHASH).tar.gz
rm -rf sctk-$(SCTK_GITHASH) sctk
mv SCTK-$(SCTK_GITHASH)* sctk-$(SCTK_GITHASH)
ln -s sctk-$(SCTK_GITHASH) sctk
touch sctk-$(SCTK_GITHASH).tar.gz
sctk-$(SCTK_GITHASH).tar.gz:
if [ -d '$(DOWNLOAD_DIR)' ]; then \
cp -p '$(DOWNLOAD_DIR)/sctk-$(SCTK_GITHASH).tar.gz' .; \
else \
$(WGET) -nv -T 10 -t 3 -O sctk-$(SCTK_GITHASH).tar.gz \
https://github.com/usnistgov/SCTK/archive/$(SCTK_GITHASH).tar.gz; \
fi
sctk_cleaned:
-for d in sctk/ sctk-*/; do \
[ ! -f $$d/.compiled ] || $(MAKE) -C $$d clean; \
rm -f $$d/.compiled; \
done
# Utils
* [kaldi utils](https://github.com/kaldi-asr/kaldi/blob/cbed4ff688/egs/wsj/s5/utils)
* [espnet utils](https://github.com/espnet/espnet/tree/master/utils)
......@@ -27,33 +27,33 @@ def main(args):
val_scores = []
beat_val_scores = []
selected_epochs = []
if args.val_best:
jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= args.min_epoch and epoch <= args.max_epoch:
val_scores.append((epoch, loss))
val_scores = np.array(val_scores)
jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= args.min_epoch and epoch <= args.max_epoch:
val_scores.append((epoch, loss))
val_scores = np.array(val_scores)
if args.val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
path_list = [
args.ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:args.num, 0]
]
beat_val_scores = sorted_val_scores[:args.num, 1]
selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64)
print("best val scores = " + str(beat_val_scores))
print("selected epochs = " + str(selected_epochs))
else:
path_list = glob.glob(f'{args.ckpt_dir}/[!avg][!final]*.pdparams')
path_list = sorted(path_list, key=os.path.getmtime)
path_list = path_list[-args.num:]
sorted_val_scores = val_scores
beat_val_scores = sorted_val_scores[:args.num, 1]
selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64)
print("selected val scores = " + str(beat_val_scores))
print("selected epochs = " + str(selected_epochs))
path_list = [
args.ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:args.num, 0]
]
print(path_list)
avg = None
......@@ -78,6 +78,7 @@ def main(args):
meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"ckpt": path_list,
"epoch": selected_epochs.tolist(),
......
#!/usr/bin/env bash
# 2020 author Jiayu DU
# Apache 2.0
# This script reads in an Arpa format language model, and converts it into the
# KenLM format language model.
[ -f path.sh ] && . ./path.sh;
# begin configuration section
kenlm_opts="" # e.g. "-q 8 -b 8" for 8bits quantization
model_type="trie" # "trie" or "probing". trie is smaller, probing is faster.
# end configuration section
. utils/parse_options.sh
if [ $# != 2 ]; then
echo "Usage: "
echo " $0 [options] <arpa-lm-path> <kenlm-path>"
echo "e.g.:"
echo " $0 data/local/lm/4gram.arpa data/lang_test/G.trie"
echo "Options:"
echo " --model-type can be either \"trie\" or \"probing\""
echo " --kenlm-opts directly pass through to kenlm"
echo " e.g. for 8bits quantization, feed \"-q 8 -b 8\""
exit 1;
fi
export LC_ALL=C
arpa_lm=$1
kenlm=$2
if ! which build_binary >& /dev/null ; then
echo "$0: cannot find KenLM's build_binary tool,"
echo "check kenlm installation (tools/extras/install_kenlm_query_only.sh)."
exit 1
fi
mkdir -p $(dirname $kenlm)
build_binary $kenlm_opts $model_type $arpa_lm $kenlm
echo "$0: Successfully built arpa into kenlm format: $kenlm"
exit 0
\ No newline at end of file
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
This diff is collapsed.
File mode changed from 100644 to 100755
This diff is collapsed.
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
File mode changed from 100644 to 100755
This diff is collapsed.