Commit 564b6b68 authored by: H huangyuxin

fix conflict

......@@ -42,6 +42,10 @@ ignore =
# these ignores are from flake8-comprehensions; please fix!
C400,C401,C402,C403,C404,C405,C407,C411,C413,C414,C415
per-file-ignores =
*/__init__.py: F401
# Specify the list of error codes you wish Flake8 to report.
select =
E,
......
This diff has been collapsed.
......@@ -83,8 +83,8 @@
"from deepspeech.frontend.utility import read_manifest\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"\n",
"from deepspeech.models.deepspeech2 import DeepSpeech2Model\n",
"from deepspeech.models.deepspeech2 import DeepSpeech2InferModel\n",
"from deepspeech.models.ds2 import DeepSpeech2Model\n",
"from deepspeech.models.ds2 import DeepSpeech2InferModel\n",
"from deepspeech.io.dataset import ManifestDataset\n",
"\n",
"\n",
......@@ -669,4 +669,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
\ No newline at end of file
}
......@@ -3431,7 +3431,7 @@
" convolution_layer_args = (output_size, cnn_module_kernel, activation,\n",
" cnn_module_norm, causal)\n",
"\n",
" self.encoders = nn.ModuleList([\n",
" self.encoders = nn.LayerList([\n",
" ConformerEncoderLayer(\n",
" size=output_size,\n",
" self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n",
......
[中文版](README_cn.md)
# PaddlePaddle ASR toolkit
# PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
*PaddleASR* is an open-source implementation of an end-to-end Automatic Speech Recognition (ASR) engine, based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient, smaller and scalable implementation, including training, inference & testing modules, and deployment.
*DeepSpeech* is an open-source implementation of an end-to-end Automatic Speech Recognition engine, based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform. Our vision is to empower both industrial application and academic research on speech recognition, via an easy-to-use, efficient, smaller and scalable implementation, including training, inference & testing modules, and deployment.
## Features
......@@ -15,6 +15,8 @@
## Setup
All tested under:
* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.1.2
......
[English](README.md)
# PaddlePaddle ASR toolkit
# PaddlePaddle Speech to Any toolkit
![License](https://img.shields.io/badge/license-Apache%202-red.svg)
![python version](https://img.shields.io/badge/python-3.7+-orange.svg)
![support os](https://img.shields.io/badge/os-linux-yellow.svg)
*PaddleASR* is an open-source project for an end-to-end Automatic Speech Recognition (ASR) engine based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform,
*DeepSpeech* is an open-source project for an end-to-end Automatic Speech Recognition engine based on the [PaddlePaddle](https://github.com/PaddlePaddle/Paddle) platform,
and our vision is to provide easy-to-use, efficient, compact, and scalable tools for speech recognition in both industrial applications and academic research, covering training, inference, and deployment.
## Features
......@@ -16,6 +16,9 @@
## Setup
Tested and verified in the following environments:
* Ubuntu 16.04
* python>=3.7
* paddlepaddle>=2.1.2
......
......@@ -30,24 +30,13 @@ logger = Log(__name__).getlog()
logger.warn = logger.warning
########### hack paddle #############
paddle.bool = 'bool'
paddle.float16 = 'float16'
paddle.half = 'float16'
paddle.float32 = 'float32'
paddle.float = 'float32'
paddle.float64 = 'float64'
paddle.double = 'float64'
paddle.int8 = 'int8'
paddle.int16 = 'int16'
paddle.short = 'int16'
paddle.int32 = 'int32'
paddle.int = 'int32'
paddle.int64 = 'int64'
paddle.long = 'int64'
paddle.uint8 = 'uint8'
paddle.uint16 = 'uint16'
paddle.complex64 = 'complex64'
paddle.complex128 = 'complex128'
paddle.cdouble = 'complex128'
......@@ -363,85 +352,8 @@ if not hasattr(paddle.Tensor, 'tolist'):
"register user tolist to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'tolist', tolist)
########### hack paddle.nn.functional #############
def glu(x: paddle.Tensor, axis=-1) -> paddle.Tensor:
"""The gated linear unit (GLU) activation."""
a, b = x.split(2, axis=axis)
act_b = F.sigmoid(b)
return a * act_b
if not hasattr(paddle.nn.functional, 'glu'):
logger.warn(
"register user glu to paddle.nn.functional, remove this when fixed!")
setattr(paddle.nn.functional, 'glu', glu)
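A minimal usage sketch of the glu hack above (the shape values are illustrative; assumes a working Paddle 2.x install):
import paddle
import paddle.nn.functional as F  # F.glu is the function registered above
x = paddle.randn([4, 8])  # the gated axis must have an even size
y = F.glu(x, axis=-1)     # split x into (a, b), return a * sigmoid(b)
assert y.shape == [4, 4]  # the gated axis is halved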
# def softplus(x):
# """Softplus function."""
# if hasattr(paddle.nn.functional, 'softplus'):
# #return paddle.nn.functional.softplus(x.float()).type_as(x)
# return paddle.nn.functional.softplus(x)
# else:
# raise NotImplementedError
# def gelu_accurate(x):
# """Gaussian Error Linear Units (GELU) activation."""
# # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py
# if not hasattr(gelu_accurate, "_a"):
# gelu_accurate._a = math.sqrt(2 / math.pi)
# return 0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
# (x + 0.044715 * paddle.pow(x, 3))))
# def gelu(x):
# """Gaussian Error Linear Units (GELU) activation."""
# if hasattr(nn.functional, 'gelu'):
# #return nn.functional.gelu(x.float()).type_as(x)
# return nn.functional.gelu(x)
# else:
# return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
# hack loss
def ctc_loss(logits,
labels,
input_lengths,
label_lengths,
blank=0,
reduction='mean',
norm_by_times=True):
#logger.info("my ctc loss with norm by times")
## https://github.com/PaddlePaddle/Paddle/blob/f5ca2db2cc/paddle/fluid/operators/warpctc_op.h#L403
loss_out = paddle.fluid.layers.warpctc(logits, labels, blank, norm_by_times,
input_lengths, label_lengths)
loss_out = paddle.fluid.layers.squeeze(loss_out, [-1])
assert reduction in ['mean', 'sum', 'none']
if reduction == 'mean':
loss_out = paddle.mean(loss_out / label_lengths)
elif reduction == 'sum':
loss_out = paddle.sum(loss_out)
return loss_out
logger.warn(
"override ctc_loss of paddle.nn.functional if exists, remove this when fixed!"
)
F.ctc_loss = ctc_loss
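A hypothetical call sketch for the ctc_loss override above (shapes follow the warpctc convention: logits (Tmax, batch, num_classes), labels (batch, Lmax); assumes a Paddle 2.x release that still ships paddle.fluid.layers.warpctc):
import paddle
import paddle.nn.functional as F  # F.ctc_loss is the override above
logits = paddle.randn([50, 2, 10])
labels = paddle.randint(1, 10, [2, 12], dtype='int32')  # 0 is the blank id
input_lengths = paddle.to_tensor([50, 40], dtype='int64')
label_lengths = paddle.to_tensor([12, 9], dtype='int64')
loss = F.ctc_loss(logits, labels, input_lengths, label_lengths,
                  blank=0, reduction='mean', norm_by_times=True)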
########### hack paddle.nn #############
if not hasattr(paddle.nn, 'Module'):
logger.warn("register user Module to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'Module', paddle.nn.Layer)
# without this, code may hit: assert isinstance(sublayer, core.Layer)
if not hasattr(paddle.nn, 'ModuleList'):
logger.warn(
"register user ModuleList to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ModuleList', paddle.nn.LayerList)
class GLU(nn.Layer):
"""Gated Linear Units (GLU) Layer"""
......@@ -450,48 +362,9 @@ class GLU(nn.Layer):
self.dim = dim
def forward(self, xs):
return glu(xs, dim=self.dim)
return F.glu(xs, axis=self.dim)
if not hasattr(paddle.nn, 'GLU'):
logger.warn("register user GLU to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'GLU', GLU)
# TODO(Hui Zhang): remove this Layer
class ConstantPad2d(nn.Layer):
"""Pads the input tensor boundaries with a constant value.
For N-dimensional padding, use paddle.nn.functional.pad().
"""
def __init__(self, padding: Union[tuple, list, int], value: float):
    """
    Args:
        padding (int|tuple|list): the size of the padding.
            If is int, uses the same padding in all boundaries.
            If a 4-tuple, uses (padding_left, padding_right, padding_top, padding_bottom)
        value (float): pad value
    """
    super().__init__()
    self.padding = padding if isinstance(padding,
                                         (tuple, list)) else [padding] * 4
    self.value = value
def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
return nn.functional.pad(
xs,
self.padding,
mode='constant',
value=self.value,
data_format='NCHW')
if not hasattr(paddle.nn, 'ConstantPad2d'):
logger.warn(
"register user ConstantPad2d to paddle.nn, remove this when fixed!")
setattr(paddle.nn, 'ConstantPad2d', ConstantPad2d)
########### hack paddle.jit #############
if not hasattr(paddle.jit, 'export'):
logger.warn("register user export to paddle.jit, remove this when fixed!")
setattr(paddle.jit, 'export', paddle.jit.to_static)
......@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
FILES = [
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
or fn.endswith('unittest.cc'))
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
]
LIBS = ['stdc++']
......
......@@ -23,7 +23,7 @@ from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.socket_server import AsrRequestHandler
from deepspeech.utils.socket_server import AsrTCPServer
......
......@@ -21,7 +21,7 @@ from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.socket_server import AsrRequestHandler
from deepspeech.utils.socket_server import AsrTCPServer
......
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
parser.add_argument("--model_type")
args = parser.parse_args()
if args.model_type is None:
......
......@@ -31,6 +31,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument("--model_type")
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
if args.model_type is None:
......
......@@ -30,6 +30,12 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path to save the asr result")
# load jit model from
parser.add_argument(
"--export_path", type=str, help="path of the jit model to load")
parser.add_argument("--model_type")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -21,7 +21,7 @@ from paddle.io import DataLoader
from deepspeech.exps.deepspeech2.config import get_cfg_defaults
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.models.deepspeech2 import DeepSpeech2Model
from deepspeech.models.ds2 import DeepSpeech2Model
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils import error_rate
from deepspeech.utils.utility import add_arguments
......
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -34,6 +34,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -264,12 +264,12 @@ class U2Trainer(Trainer):
config.data.manifest = config.data.test_manifest
# filtering test examples yields fewer examples but no mismatch with training,
# and allows a large batch size to save training time, so filter test egs now.
# config.data.min_input_len = 0.0 # second
# config.data.max_input_len = float('inf') # second
# config.data.min_output_len = 0.0 # tokens
# config.data.max_output_len = float('inf') # tokens
# config.data.min_output_input_ratio = 0.00
# config.data.max_output_input_ratio = float('inf')
config.data.min_input_len = 0.0 # second
config.data.max_input_len = float('inf') # second
config.data.min_output_len = 0.0 # tokens
config.data.max_output_len = float('inf') # tokens
config.data.min_output_input_ratio = 0.00
config.data.max_output_input_ratio = float('inf')
test_dataset = ManifestDataset.from_config(config)
# return text ord id
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Evaluation for U2 model."""
import cProfile
from yacs.config import CfgNode
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.utility import print_arguments
model_test_alias = {
"u2": "deepspeech.exps.u2.model:U2Tester",
"u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Tester",
}
def main_sp(config, args):
class_obj = dynamic_import(args.model_name, model_test_alias)
exp = class_obj(config, args)
exp.setup()
if args.run_mode == 'test':
exp.run_test()
elif args.run_mode == 'export':
exp.run_export()
elif args.run_mode == 'align':
exp.run_align()
def main(config, args):
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument(
'--model-name',
type=str,
default='u2_kaldi',
help='model name, e.g. deepspeech2, u2, u2_kaldi, u2_st')
parser.add_argument(
'--run-mode',
type=str,
default='test',
help='run mode, e.g. test, align, export')
parser.add_argument(
'--dict-path', type=str, default=None, help='dict path.')
# save asr result to
parser.add_argument(
"--result-file", type=str, help="path of save the asr result")
# save jit model to
parser.add_argument(
"--export-path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
config = CfgNode()
config.set_new_allowed(True)
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats('test.profile')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Trainer for U2 model."""
import cProfile
import os
from paddle import distributed as dist
from yacs.config import CfgNode
from deepspeech.training.cli import default_argument_parser
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.utility import print_arguments
model_train_alias = {
"u2": "deepspeech.exps.u2.model:U2Trainer",
"u2_kaldi": "deepspeech.exps.u2_kaldi.model:U2Trainer",
}
def main_sp(config, args):
class_obj = dynamic_import(args.model_name, model_train_alias)
exp = class_obj(config, args)
exp.setup()
exp.run()
def main(config, args):
if args.device == "gpu" and args.nprocs > 1:
dist.spawn(main_sp, args=(config, args), nprocs=args.nprocs)
else:
main_sp(config, args)
if __name__ == "__main__":
parser = default_argument_parser()
parser.add_argument(
'--model-name',
type=str,
default='u2_kaldi',
help='model name, e.g. deepspeech2, u2, u2_kaldi, u2_st')
args = parser.parse_args()
print_arguments(args, globals())
config = CfgNode()
config.set_new_allowed(True)
config.merge_from_file(args.config)
if args.opts:
config.merge_from_list(args.opts)
config.freeze()
print(config)
if args.dump_config:
with open(args.dump_config, 'w') as f:
print(config, file=f)
# Setting for profiling
pr = cProfile.Profile()
pr.runcall(main, config, args)
pr.dump_stats(os.path.join(args.output, 'train.profile'))
This diff has been collapsed.
......@@ -30,6 +30,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save jit model to
parser.add_argument(
"--export_path", type=str, help="path of the jit model to save")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -34,6 +34,9 @@ def main(config, args):
if __name__ == "__main__":
parser = default_argument_parser()
# save asr result to
parser.add_argument(
"--result_file", type=str, help="path of save the asr result")
args = parser.parse_args()
print_arguments(args, globals())
......
......@@ -13,18 +13,28 @@
# limitations under the License.
"""Contains the data augmentation pipeline."""
import json
from collections.abc import Sequence
from inspect import signature
import numpy as np
from deepspeech.frontend.augmentor.impulse_response import ImpulseResponseAugmentor
from deepspeech.frontend.augmentor.noise_perturb import NoisePerturbAugmentor
from deepspeech.frontend.augmentor.online_bayesian_normalization import \
OnlineBayesianNormalizationAugmentor
from deepspeech.frontend.augmentor.resample import ResampleAugmentor
from deepspeech.frontend.augmentor.shift_perturb import ShiftPerturbAugmentor
from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor
from deepspeech.frontend.augmentor.speed_perturb import SpeedPerturbAugmentor
from deepspeech.frontend.augmentor.volume_perturb import VolumePerturbAugmentor
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.dynamic_import import dynamic_import
from deepspeech.utils.log import Log
__all__ = ["AugmentationPipeline"]
logger = Log(__name__).getlog()
import_alias = dict(
volume="deepspeech.frontend.augmentor.impulse_response:VolumePerturbAugmentor",
shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
class AugmentationPipeline():
......@@ -78,20 +88,74 @@ class AugmentationPipeline():
augmentor to take effect. If "prob" is zero, the augmentor does not take
effect.
:param augmentation_config: Augmentation configuration in json string.
:type augmentation_config: str
:param random_seed: Random seed.
:type random_seed: int
:raises ValueError: If the augmentation json config is in an incorrect format.
Params:
augmentation_config(str): Augmentation configuration in json string.
random_seed(int): Random seed.
train(bool): whether in train mode.
Raises:
ValueError: If the augmentation json config is in an incorrect format.
"""
def __init__(self, augmentation_config: str, random_seed=0):
SPEC_TYPES = {'specaug'}
def __init__(self, augmentation_config: str, random_seed: int=0):
self._rng = np.random.RandomState(random_seed)
self._spec_types = ('specaug')
self._augmentors, self._rates = self._parse_pipeline_from(
augmentation_config, 'audio')
self.conf = {'mode': 'sequential', 'process': []}
if augmentation_config:
process = json.loads(augmentation_config)
self.conf['process'] += process
self._augmentors, self._rates = self._parse_pipeline_from('all')
self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
'audio')
self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
augmentation_config, 'feature')
'feature')
def __call__(self, xs, uttid_list=None, **kwargs):
if not isinstance(xs, Sequence):
is_batch = False
xs = [xs]
else:
is_batch = True
if isinstance(uttid_list, str):
uttid_list = [uttid_list for _ in range(len(xs))]
if self.conf.get("mode", "sequential") == "sequential":
for idx, (func, rate) in enumerate(
zip(self._augmentors, self._rates), 0):
if self._rng.uniform(0., 1.) >= rate:
continue
# Pass through only the kwargs that the func accepts
try:
param = signature(func).parameters
except ValueError:
# signature() fails for some functions, e.g. built-ins
param = {}
_kwargs = {k: v for k, v in kwargs.items() if k in param}
try:
if uttid_list is not None and "uttid" in param:
xs = [
func(x, u, **_kwargs)
for x, u in zip(xs, uttid_list)
]
else:
xs = [func(x, **_kwargs) for x in xs]
except Exception:
logger.fatal("Catch a exception from {}th func: {}".format(
idx, func))
raise
else:
raise NotImplementedError(
"Not supporting mode={}".format(self.conf["mode"]))
if is_batch:
return xs
else:
return xs[0]
def transform_audio(self, audio_segment):
"""Run the pre-processing pipeline for data augmentation.
......@@ -101,7 +165,7 @@ class AugmentationPipeline():
:param audio_segment: Audio segment to process.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
for augmentor, rate in zip(self._augmentors, self._rates):
for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
if self._rng.uniform(0., 1.) < rate:
augmentor.transform_audio(audio_segment)
......@@ -116,52 +180,39 @@ class AugmentationPipeline():
spec_segment = augmentor.transform_feature(spec_segment)
return spec_segment
def _parse_pipeline_from(self, config_json, aug_type='audio'):
def _parse_pipeline_from(self, aug_type='all'):
"""Parse the config json to build a augmentation pipelien."""
assert aug_type in ('audio', 'feature'), aug_type
try:
configs = json.loads(config_json)
audio_confs = []
feature_confs = []
for config in configs:
if config["type"] in self._spec_types:
feature_confs.append(config)
else:
audio_confs.append(config)
if aug_type == 'audio':
aug_confs = audio_confs
elif aug_type == 'feature':
aug_confs = feature_confs
augmentors = [
self._get_augmentor(config["type"], config["params"])
for config in aug_confs
]
rates = [config["prob"] for config in aug_confs]
except Exception as e:
raise ValueError("Failed to parse the augmentation config json: "
"%s" % str(e))
assert aug_type in ('audio', 'feature', 'all'), aug_type
audio_confs = []
feature_confs = []
all_confs = []
for config in self.conf['process']:
all_confs.append(config)
if config["type"] in self.SPEC_TYPES:
feature_confs.append(config)
else:
audio_confs.append(config)
if aug_type == 'audio':
aug_confs = audio_confs
elif aug_type == 'feature':
aug_confs = feature_confs
else:
aug_confs = all_confs
augmentors = [
self._get_augmentor(config["type"], config["params"])
for config in aug_confs
]
rates = [config["prob"] for config in aug_confs]
return augmentors, rates
def _get_augmentor(self, augmentor_type, params):
"""Return an augmentation model by the type name, and pass in params."""
if augmentor_type == "volume":
return VolumePerturbAugmentor(self._rng, **params)
elif augmentor_type == "shift":
return ShiftPerturbAugmentor(self._rng, **params)
elif augmentor_type == "speed":
return SpeedPerturbAugmentor(self._rng, **params)
elif augmentor_type == "resample":
return ResampleAugmentor(self._rng, **params)
elif augmentor_type == "bayesian_normal":
return OnlineBayesianNormalizationAugmentor(self._rng, **params)
elif augmentor_type == "noise":
return NoisePerturbAugmentor(self._rng, **params)
elif augmentor_type == "impulse":
return ImpulseResponseAugmentor(self._rng, **params)
elif augmentor_type == "specaug":
return SpecAugmentor(self._rng, **params)
else:
class_obj = dynamic_import(augmentor_type, import_alias)
assert issubclass(class_obj, AugmentorBase)
try:
obj = class_obj(self._rng, **params)
except Exception:
raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
return obj
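A minimal usage sketch of the pipeline above (the config values are hypothetical; the "type"/"params"/"prob" JSON schema is the one parsed by _parse_pipeline_from):
import json
config = json.dumps([
    {"type": "shift", "params": {"min_shift_ms": -5, "max_shift_ms": 5}, "prob": 0.5},
    {"type": "specaug", "params": {"F": 10, "T": 50, "n_freq_masks": 2,
                                   "n_time_masks": 2, "p": 1.0, "W": 40}, "prob": 1.0},
])
pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)
# audio-domain augmentors run on AudioSegment/SpeechSegment objects;
# spec-domain augmentors ("specaug") run on feature arrays:
# pipeline.transform_audio(audio_segment)
# spec = pipeline.transform_feature(spec)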
......@@ -28,6 +28,10 @@ class AugmentorBase():
def __init__(self):
pass
@abstractmethod
def __call__(self, xs):
raise NotImplementedError("AugmentorBase: Not impl __call__")
@abstractmethod
def transform_audio(self, audio_segment):
"""Adds various effects to the input audio segment. Such effects
......@@ -40,7 +44,7 @@ class AugmentorBase():
:param audio_segment: Audio segment to add effects to.
:type audio_segment: AudioSegmenet|SpeechSegment
"""
raise NotImplementedError
raise NotImplementedError("AugmentorBase: Not impl transform_audio")
@abstractmethod
def transform_feature(self, spec_segment):
......@@ -52,4 +56,4 @@ class AugmentorBase():
Args:
spec_segment (Spectrogram): Spectrogram segment to add effects to.
"""
raise NotImplementedError
raise NotImplementedError("AugmentorBase: Not impl transform_feature")
......@@ -30,6 +30,12 @@ class ImpulseResponseAugmentor(AugmentorBase):
self._rng = rng
self._impulse_manifest = read_manifest(impulse_manifest_path)
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Add impulse response effect.
......
......@@ -36,6 +36,12 @@ class NoisePerturbAugmentor(AugmentorBase):
self._rng = rng
self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Add background noise audio.
......
......@@ -44,6 +44,12 @@ class OnlineBayesianNormalizationAugmentor(AugmentorBase):
self._rng = rng
self._startup_delay = startup_delay
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Normalizes the input audio using the online Bayesian approach.
......
......@@ -31,6 +31,12 @@ class ResampleAugmentor(AugmentorBase):
self._new_sample_rate = new_sample_rate
self._rng = rng
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Resamples the input audio to a target sample rate.
......
......@@ -31,6 +31,12 @@ class ShiftPerturbAugmentor(AugmentorBase):
self._max_shift_ms = max_shift_ms
self._rng = rng
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Shift audio.
......
......@@ -12,7 +12,11 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the volume perturb augmentation model."""
import random
import numpy as np
from PIL import Image
from PIL.Image import BICUBIC
from deepspeech.frontend.augmentor.base import AugmentorBase
from deepspeech.utils.log import Log
......@@ -41,7 +45,9 @@ class SpecAugmentor(AugmentorBase):
W=40,
adaptive_number_ratio=0,
adaptive_size_ratio=0,
max_n_time_masks=20):
max_n_time_masks=20,
replace_with_zero=True,
warp_mode='PIL'):
"""SpecAugment class.
Args:
rng (random.Random): random generator object.
......@@ -54,17 +60,22 @@ class SpecAugmentor(AugmentorBase):
adaptive_number_ratio (float): adaptive multiplicity ratio for time masking
adaptive_size_ratio (float): adaptive size ratio for time masking
max_n_time_masks (int): maximum number of time masking
replace_with_zero (bool): pad zero on mask if true else use mean
warp_mode (str): "PIL" (default, fast, not differentiable)
or "sparse_image_warp" (slow, differentiable)
"""
super().__init__()
self._rng = rng
self.inplace = True
self.replace_with_zero = replace_with_zero
self.mode = warp_mode
self.W = W
self.F = F
self.T = T
self.n_freq_masks = n_freq_masks
self.n_time_masks = n_time_masks
self.p = p
#logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}")
# adaptive SpecAugment
self.adaptive_number_ratio = adaptive_number_ratio
......@@ -121,21 +132,86 @@ class SpecAugmentor(AugmentorBase):
def time_mask(self):
return self._time_mask
def time_warp(xs, W=40):
raise NotImplementedError
def __repr__(self):
    return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}"
def time_warp(self, x, mode='PIL'):
"""time warp for spec augment
move random center frame by the random width ~ uniform(-window, window)
Args:
x (np.ndarray): spectrogram (time, freq)
mode (str): PIL or sparse_image_warp
Raises:
    NotImplementedError: if mode == "sparse_image_warp" (not implemented yet)
    NotImplementedError: if mode is not one of "PIL", "sparse_image_warp"
Returns:
np.ndarray: time warped spectrogram (time, freq)
"""
window = max_time_warp = self.W
if window == 0:
return x
if mode == "PIL":
t = x.shape[0]
if t - window <= window:
return x
# NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
center = random.randrange(window, t - window)
warped = random.randrange(center - window, center +
window) + 1 # 1 ... t - 1
left = Image.fromarray(x[:center]).resize((x.shape[1], warped),
BICUBIC)
right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped),
BICUBIC)
if self.inplace:
x[:warped] = left
x[warped:] = right
return x
return np.concatenate((left, right), 0)
elif mode == "sparse_image_warp":
raise NotImplementedError('sparse_image_warp')
else:
raise NotImplementedError(
"unknown resize mode: " + mode +
", choose one from (PIL, sparse_image_warp).")
def mask_freq(self, xs, replace_with_zero=False):
    n_bins = xs.shape[0]
def mask_freq(self, x, replace_with_zero=False):
    """freq mask
    Args:
        x (np.ndarray): spectrogram (time, freq)
        replace_with_zero (bool, optional): Defaults to False.
    Returns:
        np.ndarray: freq mask spectrogram (time, freq)
    """
    n_bins = x.shape[1]
for i in range(0, self.n_freq_masks):
f = int(self._rng.uniform(low=0, high=self.F))
f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
xs[f_0:f_0 + f, :] = 0
assert f_0 <= f_0 + f
if replace_with_zero:
x[:, f_0:f_0 + f] = 0
else:
x[:, f_0:f_0 + f] = x.mean()
self._freq_mask = (f_0, f_0 + f)
return xs
return x
def mask_time(self, xs, replace_with_zero=False):
n_frames = xs.shape[1]
def mask_time(self, x, replace_with_zero=False):
"""time mask
Args:
x (np.ndarray): spectrogram (time, freq)
replace_with_zero (bool, optional): Defaults to False.
Returns:
np.ndarray: time mask spectrogram (time, freq)
"""
n_frames = x.shape[0]
if self.adaptive_number_ratio > 0:
n_masks = int(n_frames * self.adaptive_number_ratio)
......@@ -152,19 +228,29 @@ class SpecAugmentor(AugmentorBase):
t = int(self._rng.uniform(low=0, high=T))
t = min(t, int(n_frames * self.p))
t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
xs[:, t_0:t_0 + t] = 0
assert t_0 <= t_0 + t
if replace_with_zero:
x[t_0:t_0 + t, :] = 0
else:
x[t_0:t_0 + t, :] = x.mean()
self._time_mask = (t_0, t_0 + t)
return xs
return x
def __call__(self, x, train=True):
if not train:
return x
return self.transform_feature(x)
def transform_feature(self, xs: np.ndarray):
def transform_feature(self, x: np.ndarray):
"""
Args:
xs (FloatTensor): `[F, T]`
x (np.ndarray): `[T, F]`
Returns:
xs (FloatTensor): `[F, T]`
x (np.ndarray): `[T, F]`
"""
# xs = self.time_warp(xs)
xs = self.mask_freq(xs)
xs = self.mask_time(xs)
return xs
assert isinstance(x, np.ndarray)
assert x.ndim == 2
x = self.time_warp(x, self.mode)
x = self.mask_freq(x, self.replace_with_zero)
x = self.mask_time(x, self.replace_with_zero)
return x
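A minimal sketch of applying the augmentor above to a (time, freq) feature array (the constructor keyword values are illustrative):
import numpy as np
aug = SpecAugmentor(rng=np.random.RandomState(0), F=10, T=50,
                    n_freq_masks=2, n_time_masks=2, p=1.0, W=40)
spec = np.random.randn(200, 80).astype('float32')
spec = aug(spec, train=True)  # time warp, then freq and time masking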
......@@ -79,6 +79,12 @@ class SpeedPerturbAugmentor(AugmentorBase):
self._rates = np.linspace(
self._min_rate, self._max_rate, self._num_rates, endpoint=True)
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Sample a new speed rate from the given range and
changes the speed of the given audio clip.
......
......@@ -37,6 +37,12 @@ class VolumePerturbAugmentor(AugmentorBase):
self._max_gain_dBFS = max_gain_dBFS
self._rng = rng
def __call__(self, x, uttid=None, train=True):
if not train:
return x
self.transform_audio(x)
return x
def transform_audio(self, audio_segment):
"""Change audio loadness.
......
......@@ -11,3 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .audio_featurizer import AudioFeaturizer #noqa: F401
from .speech_featurizer import SpeechFeaturizer
from .text_featurizer import TextFeaturizer
......@@ -18,7 +18,7 @@ from python_speech_features import logfbank
from python_speech_features import mfcc
class AudioFeaturizer(object):
class AudioFeaturizer():
"""Audio featurizer, for extracting features from audio contents of
AudioSegment or SpeechSegment.
......@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
raise ValueError("Unknown specgram_type %s. "
"Supported values: linear." % self._specgram_type)
def _compute_linear_specgram(self,
samples,
sample_rate,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy."""
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must not be greater than half of "
"sample rate.")
if stride_ms > window_ms:
raise ValueError("Stride size must not be greater than "
"window size.")
stride_size = int(0.001 * sample_rate * stride_ms)
window_size = int(0.001 * sample_rate * window_ms)
specgram, freqs = self._specgram_real(
samples,
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
return np.log(specgram[:ind, :] + eps)
def _specgram_real(self, samples, window_size, stride_size, sample_rate):
"""Compute the spectrogram for samples from a real signal."""
# extract strided windows
......@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
return fft, freqs
def _compute_linear_specgram(self,
samples,
sample_rate,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
eps=1e-14):
"""Compute the linear spectrogram from FFT energy.
Args:
    samples (np.ndarray): 1-D audio samples.
    sample_rate (int): audio sample rate, in Hz.
    stride_ms (float, optional): stride size in milliseconds. Defaults to 10.0.
    window_ms (float, optional): window size in milliseconds. Defaults to 20.0.
    max_freq (float, optional): highest frequency to keep; None means half the
        sample rate. Defaults to None.
    eps (float, optional): small constant added before taking the log. Defaults to 1e-14.
Raises:
    ValueError: if max_freq is greater than half of the sample rate.
    ValueError: if stride_ms is greater than window_ms.
Returns:
    np.ndarray: log spectrogram, (time, freq)
"""
if max_freq is None:
max_freq = sample_rate / 2
if max_freq > sample_rate / 2:
raise ValueError("max_freq must not be greater than half of "
"sample rate.")
if stride_ms > window_ms:
raise ValueError("Stride size must not be greater than "
"window size.")
stride_size = int(0.001 * sample_rate * stride_ms)
window_size = int(0.001 * sample_rate * window_ms)
specgram, freqs = self._specgram_real(
samples,
window_size=window_size,
stride_size=stride_size,
sample_rate=sample_rate)
ind = np.where(freqs <= max_freq)[0][-1] + 1
# (freq, time)
spec = np.log(specgram[:ind, :] + eps)
return np.transpose(spec)
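A small usage sketch (featurizer stands for a hypothetical AudioFeaturizer instance; samples are 16 kHz mono):
import numpy as np
sr = 16000
t = np.linspace(0, 1, sr, endpoint=False)
samples = np.sin(2 * np.pi * 440 * t).astype('float32')
feat = featurizer._compute_linear_specgram(samples, sr)  # -> (time, freq)
# with the 10 ms stride / 20 ms window defaults: stride_size = 160,
# window_size = 320, giving about (16000 - 320) // 160 + 1 = 99 frames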
def _concat_delta_delta(self, feat):
"""append delat, delta-delta feature.
Args:
feat (np.ndarray): (D, T)
feat (np.ndarray): (T, D)
Returns:
np.ndarray: feat with delta-delta, (3*D, T)
np.ndarray: feat with delta-delta, (T, 3*D)
"""
feat = np.transpose(feat)
# Deltas
d_feat = delta(feat, 2)
# Deltas-Deltas
dd_feat = delta(d_feat, 2)
# transpose
feat = np.transpose(feat)
d_feat = np.transpose(d_feat)
dd_feat = np.transpose(dd_feat)
# concat above three features
concat_feat = np.concatenate((feat, d_feat, dd_feat))
concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
return concat_feat
def _compute_mfcc(self,
......@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
ceplifter=22,
useEnergy=True,
winfunc='povey')
mfcc_feat = np.transpose(mfcc_feat)
if delta_delta:
mfcc_feat = self._concat_delta_delta(mfcc_feat)
return mfcc_feat
......@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
remove_dc_offset=True,
preemph=0.97,
wintype='povey')
fbank_feat = np.transpose(fbank_feat)
if delta_delta:
fbank_feat = self._concat_delta_delta(fbank_feat)
return fbank_feat
......@@ -16,7 +16,7 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
class SpeechFeaturizer(object):
class SpeechFeaturizer():
"""Speech featurizer, for extracting features from both audio and transcript
contents of SpeechSegment.
......
......@@ -14,12 +14,19 @@
"""Contains the text featurizer class."""
import sentencepiece as spm
from deepspeech.frontend.utility import EOS
from deepspeech.frontend.utility import UNK
from ..utility import EOS
from ..utility import load_dict
from ..utility import UNK
__all__ = ["TextFeaturizer"]
class TextFeaturizer(object):
def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None):
class TextFeaturizer():
def __init__(self,
unit_type,
vocab_filepath,
spm_model_prefix=None,
maskctc=False):
"""Text featurizer, for processing or extracting features from text.
Currently, it supports char/word/sentence-piece level tokenizing and conversion into
......@@ -34,11 +41,12 @@ class TextFeaturizer(object):
assert unit_type in ('char', 'spm', 'word')
self.unit_type = unit_type
self.unk = UNK
self.maskctc = maskctc
if vocab_filepath:
self._vocab_dict, self._id2token, self._vocab_list = self._load_vocabulary_from_file(
vocab_filepath)
self.unk_id = self._vocab_list.index(self.unk)
self.eos_id = self._vocab_list.index(EOS)
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file(
vocab_filepath, maskctc)
self.vocab_size = len(self.vocab_list)
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
......@@ -67,7 +75,7 @@ class TextFeaturizer(object):
"""Convert text string to a list of token indices.
Args:
text (str): Text to process.
text (str): Text.
Returns:
List[int]: List of token indices.
......@@ -75,8 +83,8 @@ class TextFeaturizer(object):
tokens = self.tokenize(text)
ids = []
for token in tokens:
token = token if token in self._vocab_dict else self.unk
ids.append(self._vocab_dict[token])
token = token if token in self.vocab_dict else self.unk
ids.append(self.vocab_dict[token])
return ids
def defeaturize(self, idxs):
......@@ -87,7 +95,7 @@ class TextFeaturizer(object):
idxs (List[int]): List of token indices.
Returns:
str: Text to process.
str: Text.
"""
tokens = []
for idx in idxs:
......@@ -97,33 +105,6 @@ class TextFeaturizer(object):
text = self.detokenize(tokens)
return text
@property
def vocab_size(self):
"""Return the vocabulary size.
:return: Vocabulary size.
:rtype: int
"""
return len(self._vocab_list)
@property
def vocab_list(self):
"""Return the vocabulary in list.
Returns:
List[str]: tokens.
"""
return self._vocab_list
@property
def vocab_dict(self):
"""Return the vocabulary in dict.
Returns:
Dict[str, int]: token str -> int
"""
return self._vocab_dict
def char_tokenize(self, text):
"""Character tokenizer.
......@@ -206,14 +187,16 @@ class TextFeaturizer(object):
return decode(tokens)
def _load_vocabulary_from_file(self, vocab_filepath):
def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool):
"""Load vocabulary from file."""
vocab_lines = []
with open(vocab_filepath, 'r', encoding='utf-8') as file:
vocab_lines.extend(file.readlines())
vocab_list = [line[:-1] for line in vocab_lines]
vocab_list = load_dict(vocab_filepath, maskctc)
assert vocab_list is not None
id2token = dict(
[(idx, token) for (idx, token) in enumerate(vocab_list)])
token2id = dict(
[(token, idx) for (idx, token) in enumerate(vocab_list)])
return token2id, id2token, vocab_list
unk_id = vocab_list.index(UNK)
eos_id = vocab_list.index(EOS)
return token2id, id2token, vocab_list, unk_id, eos_id
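A minimal round-trip sketch for the featurizer above (the vocab path is hypothetical; char-level units):
featurizer = TextFeaturizer(unit_type='char', vocab_filepath='data/vocab.txt')
ids = featurizer.featurize("hello")  # out-of-vocab tokens map to <unk>
text = featurizer.defeaturize(ids)   # decoding stops at <eos>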
......@@ -40,21 +40,21 @@ class CollateFunc(object):
number = 0
for item in batch:
audioseg = AudioSegment.from_file(item['feat'])
feat = self.feature_func(audioseg) #(D, T)
feat = self.feature_func(audioseg) #(T, D)
sums = np.sum(feat, axis=1)
sums = np.sum(feat, axis=0)
if mean_stat is None:
mean_stat = sums
else:
mean_stat += sums
square_sums = np.sum(np.square(feat), axis=1)
square_sums = np.sum(np.square(feat), axis=0)
if var_stat is None:
var_stat = square_sums
else:
var_stat += square_sums
number += feat.shape[1]
number += feat.shape[0]
return number, mean_stat, var_stat
......@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
"""Normalize features to be of zero mean and unit stddev.
:param features: Input features to be normalized.
:type features: ndarray, shape (D, T)
:type features: ndarray, shape (T, D)
:param eps: added to stddev to provide numerical stability.
:type eps: float
:return: Normalized features.
......@@ -131,8 +131,8 @@ class FeatureNormalizer(object):
def _read_mean_std_from_file(self, filepath, eps=1e-20):
"""Load mean and std from file."""
mean, istd = load_cmvn(filepath, filetype='json')
self._mean = np.expand_dims(mean, axis=-1)
self._istd = np.expand_dims(istd, axis=-1)
self._mean = np.expand_dims(mean, axis=0)
self._istd = np.expand_dims(istd, axis=0)
def write_to_file(self, filepath):
"""Write the mean and stddev to the file.
......
......@@ -15,6 +15,9 @@
import codecs
import json
import math
from typing import List
from typing import Optional
from typing import Text
import numpy as np
......@@ -23,16 +26,35 @@ from deepspeech.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
"load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", "max_dbfs",
"mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", "EOS", "UNK",
"BLANK"
"load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
"max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
"EOS", "UNK", "BLANK", "MASKCTC"
]
IGNORE_ID = -1
SOS = "<sos/eos>"
# `sos` and `eos` use the same token
SOS = "<eos>"
EOS = SOS
UNK = "<unk>"
BLANK = "<blank>"
MASKCTC = "<mask>"
def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:
if dict_path is None:
return None
with open(dict_path, "r") as f:
dictionary = f.readlines()
char_list = [entry.strip().split(" ")[0] for entry in dictionary]
if BLANK not in char_list:
char_list.insert(0, BLANK)
if EOS not in char_list:
char_list.append(EOS)
# for non-autoregressive maskctc model
if maskctc and MASKCTC not in char_list:
char_list.append(MASKCTC)
return char_list
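A small sketch of load_dict above on a hypothetical unit-dict file (one "token id" pair per line; only the token field is kept):
with open("vocab.txt", "w") as f:
    f.write("a 1\nb 2\n")
chars = load_dict("vocab.txt", maskctc=True)
# BLANK is prepended and EOS/MASKCTC appended when missing:
# ['<blank>', 'a', 'b', '<eos>', '<mask>']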
def read_manifest(
......@@ -47,12 +69,20 @@ def read_manifest(
Args:
manifest_path ([type]): Manifest file to load and parse.
max_input_len ([type], optional): maximum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum output seq length, in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum output seq length, in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output seq length / input seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output seq length / input seq length ratio. Defaults to 0.05.
max_input_len ([type], optional): maximum input seq length,
in seconds for raw wav, in frame numbers for feature data.
Defaults to float('inf').
min_input_len (float, optional): minimum input seq length,
in seconds for raw wav, in frame numbers for feature data.
Defaults to 0.0.
max_output_len (float, optional): maximum output seq length,
in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum output seq length,
in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional):
maximum output seq length / input seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional):
minimum output seq length / input seq length ratio. Defaults to 0.05.
Raises:
IOError: If failed to parse the manifest.
......
......@@ -11,139 +11,3 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from paddle.io import DataLoader
from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io.sampler import SortagradDistributedBatchSampler
def create_dataloader(manifest_path,
unit_type,
vocab_filepath,
mean_std_filepath,
spm_model_prefix,
augmentation_config='{}',
max_input_len=float('inf'),
min_input_len=0.0,
max_output_len=float('inf'),
min_output_len=0.0,
max_output_input_ratio=float('inf'),
min_output_input_ratio=0.0,
stride_ms=10.0,
window_ms=20.0,
max_freq=None,
specgram_type='linear',
feat_dim=None,
delta_delta=False,
use_dB_normalization=True,
random_seed=0,
keep_transcription_text=False,
is_training=False,
batch_size=1,
num_workers=0,
sortagrad=False,
shuffle_method=None,
dist=False):
dataset = ManifestDataset(
manifest_path=manifest_path,
unit_type=unit_type,
vocab_filepath=vocab_filepath,
mean_std_filepath=mean_std_filepath,
spm_model_prefix=spm_model_prefix,
augmentation_config=augmentation_config,
max_input_len=max_input_len,
min_input_len=min_input_len,
max_output_len=max_output_len,
min_output_len=min_output_len,
max_output_input_ratio=max_output_input_ratio,
min_output_input_ratio=min_output_input_ratio,
stride_ms=stride_ms,
window_ms=window_ms,
max_freq=max_freq,
specgram_type=specgram_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
use_dB_normalization=use_dB_normalization,
random_seed=random_seed,
keep_transcription_text=keep_transcription_text)
if dist:
batch_sampler = SortagradDistributedBatchSampler(
dataset,
batch_size,
num_replicas=None,
rank=None,
shuffle=is_training,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
else:
batch_sampler = SortagradBatchSampler(
dataset,
shuffle=is_training,
batch_size=batch_size,
drop_last=is_training,
sortagrad=is_training,
shuffle_method=shuffle_method)
def padding_batch(batch,
padding_to=-1,
flatten=False,
keep_transcription_text=True):
"""
Padding audio features with zeros to make them have the same shape (or
a user-defined shape) within one batch.
If ``padding_to`` is -1, the maximum shape in the batch will be used
as the target shape for padding. Otherwise, `padding_to` will be the
target shape (only refers to the second axis).
If `flatten` is True, features will be flattened to 1-D arrays.
"""
new_batch = []
# get target shape
max_length = max([audio.shape[1] for audio, text in batch])
if padding_to != -1:
if padding_to < max_length:
raise ValueError("If padding_to is not -1, it should be larger "
"than any instance's shape in the batch")
max_length = padding_to
max_text_length = max([len(text) for audio, text in batch])
# padding
padded_audios = []
audio_lens = []
texts, text_lens = [], []
for audio, text in batch:
padded_audio = np.zeros([audio.shape[0], max_length])
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
padded_audios.append(padded_audio)
audio_lens.append(audio.shape[1])
padded_text = np.zeros([max_text_length])
if keep_transcription_text:
padded_text[:len(text)] = [ord(t) for t in text] # string
else:
padded_text[:len(text)] = text # ids
texts.append(padded_text)
text_lens.append(len(text))
padded_audios = np.array(padded_audios).astype('float32')
audio_lens = np.array(audio_lens).astype('int64')
texts = np.array(texts).astype('int32')
text_lens = np.array(text_lens).astype('int64')
return padded_audios, audio_lens, texts, text_lens
# collate_fn=functools.partial(padding_batch, keep_transcription_text=keep_transcription_text),
collate_fn = SpeechCollator(keep_transcription_text=keep_transcription_text)
loader = DataLoader(
dataset,
batch_sampler=batch_sampler,
collate_fn=collate_fn,
num_workers=num_workers)
return loader
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import itertools
import numpy as np
from deepspeech.utils.log import Log
__all__ = ["make_batchset"]
logger = Log(__name__).getlog()
def batchfy_by_seq(
sorted_data,
batch_size,
max_length_in,
max_length_out,
min_batch_size=1,
shortest_first=False,
ikey="input",
iaxis=0,
okey="output",
oaxis=0, ):
"""Make batch set from json dictionary
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_size: batch size
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input
(for ASR ikey="input", for TTS, MT ikey="output".)
:param int iaxis: dimension to access input
(for ASR, TTS iaxis=0, for MT iaxis=1.)
:param str okey: key to access output
(for ASR, MT okey="output". for TTS okey="input".)
:param int oaxis: dimension to access output
(for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
if batch_size <= 0:
raise ValueError(f"Invalid batch_size={batch_size}")
# check #utts is more than min_batch_size
if len(sorted_data) < min_batch_size:
raise ValueError(
f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})."
)
# make list of minibatches
minibatches = []
start = 0
while True:
_, info = sorted_data[start]
ilen = int(info[ikey][iaxis]["shape"][0])
olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else
max(map(lambda x: int(x["shape"][0]), info[okey])))
factor = max(int(ilen / max_length_in), int(olen / max_length_out))
# change batchsize depending on the input and output length
# if ilen = 1000 and max_length_in = 800
# then b = batchsize / 2
# and max(min_batches, .) avoids batchsize = 0
bs = max(min_batch_size, int(batch_size / (1 + factor)))
end = min(len(sorted_data), start + bs)
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
# check each batch is more than minimum batchsize
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# batch: List[List[Tuple[str, dict]]]
return minibatches
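A worked instance of the adaptive rule above (numbers are illustrative): with batch_size=32, max_length_in=800, max_length_out=150, and a leading utterance of ilen=1600 and olen=100, factor = max(int(1600 / 800), int(100 / 150)) = 2, so bs = max(min_batch_size, int(32 / (1 + 2))) = 10 sequences in that minibatch.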
def batchfy_by_bin(
sorted_data,
batch_bins,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variably sized batch set, which maximizes
the number of bins up to `batch_bins`.
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int batch_bins: Maximum frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if batch_bins <= 0:
raise ValueError(f"invalid batch_bins={batch_bins}")
length = len(sorted_data)
idim = int(sorted_data[0][1][ikey][0]["shape"][1])
odim = int(sorted_data[0][1][okey][0]["shape"][1])
logger.info("# utts: " + str(len(sorted_data)))
minibatches = []
start = 0
n = 0
while True:
# Dynamic batch size depending on size of samples
b = 0
next_size = 0
max_olen = 0
while next_size < batch_bins and (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim
olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim
if olen > max_olen:
max_olen = olen
next_size = (max_olen + ilen) * (b + 1)
if next_size <= batch_bins:
b += 1
elif next_size == 0:
raise ValueError(
f"Can't fit one sample in batch_bins ({batch_bins}): "
f"Please increase the value")
end = min(length, start + max(min_batch_size, b))
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
# Check for min_batch_size and fixes the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
if end == length:
break
start = end
n += 1
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples " + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
def batchfy_by_frame(
sorted_data,
max_frames_in,
max_frames_out,
max_frames_inout,
num_batches=0,
min_batch_size=1,
shortest_first=False,
ikey="input",
okey="output", ):
"""Make variable batch set, which maximizes the number of frames to max_batch_frame.
:param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json
:param int max_frames_in: Maximum input frames of a batch
:param int max_frames_out: Maximum output frames of a batch
:param int max_frames_inout: Maximum input+output frames of a batch
:param int num_batches: # number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".)
:param str okey: key to access output (for ASR okey="output". for TTS okey="input".)
:return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches
"""
if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0:
raise ValueError(
"At least, one of `--batch-frames-in`, `--batch-frames-out` or "
"`--batch-frames-inout` should be > 0")
length = len(sorted_data)
minibatches = []
start = 0
end = 0
while end != length:
# Dynamic batch size depending on size of samples
b = 0
max_olen = 0
max_ilen = 0
while (start + b) < length:
ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0])
if ilen > max_frames_in and max_frames_in != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-in ({max_frames_in}): "
f"Please increase the value")
olen = int(sorted_data[start + b][1][okey][0]["shape"][0])
if olen > max_frames_out and max_frames_out != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-out ({max_frames_out}): "
f"Please increase the value")
if ilen + olen > max_frames_inout and max_frames_inout != 0:
raise ValueError(
f"Can't fit one sample in --batch-frames-out ({max_frames_inout}): "
f"Please increase the value")
max_olen = max(max_olen, olen)
max_ilen = max(max_ilen, ilen)
in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0
out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0
inout_ok = (max_ilen + max_olen) * (
b + 1) <= max_frames_inout or max_frames_inout == 0
if in_ok and out_ok and inout_ok:
# add more seq in the minibatch
b += 1
else:
# no more seq in the minibatch
break
end = min(length, start + b)
batch = sorted_data[start:end]
if shortest_first:
batch.reverse()
minibatches.append(batch)
    # Check for min_batch_size and fix the batches if needed
i = -1
while len(minibatches[i]) < min_batch_size:
missing = min_batch_size - len(minibatches[i])
if -i == len(minibatches):
minibatches[i + 1].extend(minibatches[i])
minibatches = minibatches[1:]
break
else:
minibatches[i].extend(minibatches[i - 1][:missing])
minibatches[i - 1] = minibatches[i - 1][missing:]
i -= 1
start = end
if num_batches > 0:
minibatches = minibatches[:num_batches]
lengths = [len(x) for x in minibatches]
logger.info(
str(len(minibatches)) + " batches containing from " + str(min(lengths))
+ " to " + str(max(lengths)) + " samples" + "(avg " + str(
int(np.mean(lengths))) + " samples).")
return minibatches
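A minimal usage sketch of batchfy_by_frame under an input-frame budget; the utterance names and shapes below are hypothetical, chosen only to make the packing arithmetic visible:
# Hedged sketch: utterances are packed so that max_ilen * batch_size stays
# within --batch-frames-in; "shape" fields are [length, dim] as in data.json.
sorted_data = [
    ("utt1", {"input": [{"shape": [200, 83]}], "output": [{"shape": [20, 5002]}]}),
    ("utt2", {"input": [{"shape": [180, 83]}], "output": [{"shape": [18, 5002]}]}),
    ("utt3", {"input": [{"shape": [150, 83]}], "output": [{"shape": [15, 5002]}]}),
]
batches = batchfy_by_frame(sorted_data, max_frames_in=400,
                           max_frames_out=0, max_frames_inout=0)
# -> [[utt1, utt2], [utt3]]: with max_ilen=200, a third sample would need
#    200 * 3 = 600 > 400 input frames, so utt3 starts a new minibatch.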
def batchfy_shuffle(data, batch_size, min_batch_size, num_batches,
shortest_first):
import random
logger.info("use shuffled batch.")
    sorted_data = random.sample(list(data.items()), len(data))
logger.info("# utts: " + str(len(sorted_data)))
# make list of minibatches
minibatches = []
start = 0
while True:
end = min(len(sorted_data), start + batch_size)
        # check each batch is not smaller than the minimum batch size
minibatch = sorted_data[start:end]
if shortest_first:
minibatch.reverse()
if len(minibatch) < min_batch_size:
mod = min_batch_size - len(minibatch) % min_batch_size
additional_minibatch = [
sorted_data[i] for i in np.random.randint(0, start, mod)
]
if shortest_first:
additional_minibatch.reverse()
minibatch.extend(additional_minibatch)
minibatches.append(minibatch)
if end == len(sorted_data):
break
start = end
# for debugging
if num_batches > 0:
minibatches = minibatches[:num_batches]
logger.info("# minibatches: " + str(len(minibatches)))
return minibatches
BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"]
BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"]
def make_batchset(
data,
batch_size=0,
max_length_in=float("inf"),
max_length_out=float("inf"),
num_batches=0,
min_batch_size=1,
shortest_first=False,
batch_sort_key="input",
count="auto",
batch_bins=0,
batch_frames_in=0,
batch_frames_out=0,
batch_frames_inout=0,
iaxis=0,
oaxis=0, ):
"""Make batch set from json dictionary
    If utts have a "category" value, minibatches are made per category:
>>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'},
... {'category': 'B', 'input': ..., 'utt':'utt2'},
... {'category': 'B', 'input': ..., 'utt':'utt3'},
... {'category': 'A', 'input': ..., 'utt':'utt4'}]
    >>> make_batchset(data, batch_size=2, ...)
    [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3', ...)]]
    Note that if any utt doesn't have "category",
    it performs the same as batchfy_by_{count}.
:param List[Dict[str, Any]] data: dictionary loaded from data.json
:param int batch_size: maximum number of sequences in a minibatch.
:param int batch_bins: maximum number of bins (frames x dim) in a minibatch.
:param int batch_frames_in: maximum number of input frames in a minibatch.
:param int batch_frames_out: maximum number of output frames in a minibatch.
    :param int batch_frames_inout: maximum number of input+output frames in a minibatch.
:param str count: strategy to count maximum size of batch.
For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES
:param int max_length_in: maximum length of input to decide adaptive batch size
:param int max_length_out: maximum length of output to decide adaptive batch size
    :param int num_batches: number of batches to use (for debug)
:param int min_batch_size: minimum batch size (for multi-gpu)
:param bool shortest_first: Sort from batch with shortest samples
to longest if true, otherwise reverse
:param str batch_sort_key: how to sort data before creating minibatches
["input", "output", "shuffle"]
    :param int iaxis: dimension to access input
        (for ASR, TTS iaxis=0, for MT iaxis=1.)
:param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0,
reserved for future research, -1 means all axis.)
:return: List[List[Tuple[str, dict]]] list of batches
"""
# check args
if count not in BATCH_COUNT_CHOICES:
raise ValueError(
f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}")
if batch_sort_key not in BATCH_SORT_KEY_CHOICES:
raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be "
f"one of {BATCH_SORT_KEY_CHOICES}")
ikey = "input"
okey = "output"
batch_sort_axis = 0 # index of list
if count == "auto":
if batch_size != 0:
count = "seq"
elif batch_bins != 0:
count = "bin"
elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0:
count = "frame"
else:
raise ValueError(
f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}"
)
logger.info(f"count is auto detected as {count}")
if count != "seq" and batch_sort_key == "shuffle":
raise ValueError(
"batch_sort_key=shuffle is only available if batch_count=seq")
category2data = {} # Dict[str, dict]
for v in data:
k = v['utt']
category2data.setdefault(v.get("category"), {})[k] = v
batches_list = [] # List[List[List[Tuple[str, dict]]]]
for d in category2data.values():
if batch_sort_key == "shuffle":
batches = batchfy_shuffle(d, batch_size, min_batch_size,
num_batches, shortest_first)
batches_list.append(batches)
continue
# sort it by input lengths (long to short)
sorted_data = sorted(
d.items(),
key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]),
reverse=not shortest_first, )
logger.info("# utts: " + str(len(sorted_data)))
if count == "seq":
batches = batchfy_by_seq(
sorted_data,
batch_size=batch_size,
max_length_in=max_length_in,
max_length_out=max_length_out,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
iaxis=iaxis,
okey=okey,
oaxis=oaxis, )
if count == "bin":
batches = batchfy_by_bin(
sorted_data,
batch_bins=batch_bins,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
if count == "frame":
batches = batchfy_by_frame(
sorted_data,
max_frames_in=batch_frames_in,
max_frames_out=batch_frames_out,
max_frames_inout=batch_frames_inout,
min_batch_size=min_batch_size,
shortest_first=shortest_first,
ikey=ikey,
okey=okey, )
batches_list.append(batches)
if len(batches_list) == 1:
batches = batches_list[0]
else:
# Concat list. This way is faster than "sum(batch_list, [])"
batches = list(itertools.chain(*batches_list))
# for debugging
if num_batches > 0:
batches = batches[:num_batches]
logger.info("# minibatches: " + str(len(batches)))
# batch: List[List[Tuple[str, dict]]]
return batches
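A hedged usage sketch of make_batchset with count auto-detected as "seq"; the data entries are hypothetical and carry only the fields the function reads:
# Each entry mimics a data.json record: "shape" is [length, dim].
data = [
    {"utt": "utt1", "input": [{"shape": [120, 83]}], "output": [{"shape": [12, 5002]}]},
    {"utt": "utt2", "input": [{"shape": [300, 83]}], "output": [{"shape": [30, 5002]}]},
]
batches = make_batchset(data, batch_size=2)  # batch_size != 0 -> count="seq"
# batches is List[List[Tuple[str, dict]]]; with shortest_first=False the
# utterances are sorted long-to-short, so utt2 precedes utt1 in the batch.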
......@@ -23,7 +23,7 @@ from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log
__all__ = ["SpeechCollator"]
......@@ -242,7 +242,6 @@ class SpeechCollator():
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, transcript_part
def __call__(self, batch):
......@@ -250,7 +249,7 @@ class SpeechCollator():
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
......@@ -286,13 +285,12 @@ class SpeechCollator():
texts.append(tokens)
text_lens.append(tokens.shape[0])
padded_audios = pad_sequence(
audios, padding_value=0.0).astype(np.float32) #[B, T, D]
audio_lens = np.array(audio_lens).astype(np.int64)
padded_texts = pad_sequence(
texts, padding_value=IGNORE_ID).astype(np.int64)
text_lens = np.array(text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, padded_texts, text_lens
#[B, T, D]
xs_pad = pad_list(audios, 0.0).astype(np.float32)
ilens = np.array(audio_lens).astype(np.int64)
ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64)
olens = np.array(text_lens).astype(np.int64)
return utts, xs_pad, ilens, ys_pad, olens
@property
def manifest(self):
......
......@@ -217,6 +217,34 @@ class SpeechCollator():
return self._local_data.tar2object[tarpath].extractfile(
self._local_data.tar2info[tarpath][filename])
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
return self._speech_featurizer.vocab_list
@property
def vocab_dict(self):
return self._speech_featurizer.vocab_dict
@property
def text_feature(self):
return self._speech_featurizer.text_feature
@property
def feature_size(self):
return self._speech_featurizer.feature_size
@property
def stride_ms(self):
return self._speech_featurizer.stride_ms
def process_utterance(self, audio_file, translation):
"""Load, augment, featurize and normalize for speech data.
......@@ -244,7 +272,6 @@ class SpeechCollator():
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, translation_part
def __call__(self, batch):
......@@ -252,7 +279,7 @@ class SpeechCollator():
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
......@@ -296,34 +323,6 @@ class SpeechCollator():
text_lens = np.array(text_lens).astype(np.int64)
return utts, padded_audios, audio_lens, padded_texts, text_lens
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._speech_featurizer.vocab_size
@property
def vocab_list(self):
return self._speech_featurizer.vocab_list
@property
def vocab_dict(self):
return self._speech_featurizer.vocab_dict
@property
def text_feature(self):
return self._speech_featurizer.text_feature
@property
def feature_size(self):
return self._speech_featurizer.feature_size
@property
def stride_ms(self):
return self._speech_featurizer.stride_ms
class TripletSpeechCollator(SpeechCollator):
def process_utterance(self, audio_file, translation, transcript):
......@@ -355,7 +354,6 @@ class TripletSpeechCollator(SpeechCollator):
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
return specgram, translation_part, transcript_part
def __call__(self, batch):
......@@ -363,7 +361,7 @@ class TripletSpeechCollator(SpeechCollator):
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
text (List[int] or str): shape (U,)
Returns:
......@@ -524,49 +522,19 @@ class KaldiPrePorocessedCollator(SpeechCollator):
:rtype: tuple of (2darray, list)
"""
specgram = kaldiio.load_mat(audio_file)
specgram = specgram.transpose([1, 0])
assert specgram.shape[
0] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[0])
1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[1])
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
if self._keep_transcription_text:
return specgram, translation
else:
text_ids = self._text_featurizer.featurize(translation)
return specgram, text_ids
@property
def manifest(self):
return self._manifest
@property
def vocab_size(self):
return self._text_featurizer.vocab_size
@property
def vocab_list(self):
return self._text_featurizer.vocab_list
@property
def vocab_dict(self):
return self._text_featurizer.vocab_dict
@property
def text_feature(self):
return self._text_featurizer
@property
def feature_size(self):
return self._feat_dim
@property
def stride_ms(self):
return self._stride_ms
class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
def process_utterance(self, audio_file, translation, transcript):
......@@ -583,15 +551,13 @@ class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
:rtype: tuple of (2darray, (list, list))
"""
specgram = kaldiio.load_mat(audio_file)
specgram = specgram.transpose([1, 0])
assert specgram.shape[
0] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[0])
1] == self._feat_dim, 'expect feat dim {}, but got {}'.format(
self._feat_dim, specgram.shape[1])
# specgram augment
specgram = self._augmentation_pipeline.transform_feature(specgram)
specgram = specgram.transpose([1, 0])
if self._keep_transcription_text:
return specgram, translation, transcript
else:
......@@ -604,7 +570,7 @@ class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator):
Args:
batch ([List]): batch is (audio, text)
audio (np.ndarray) shape (D, T)
audio (np.ndarray) shape (T, D)
translation (List[int] or str): shape (U,)
transcription (List[int] or str): shape (V,)
......
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from deepspeech.io.utility import pad_list
from deepspeech.utils.log import Log
__all__ = ["CustomConverter"]
logger = Log(__name__).getlog()
class CustomConverter():
"""Custom batch converter.
Args:
subsampling_factor (int): The subsampling factor.
dtype (np.dtype): Data type to convert.
"""
def __init__(self, subsampling_factor=1, dtype=np.float32):
"""Construct a CustomConverter object."""
self.subsampling_factor = subsampling_factor
self.ignore_id = -1
self.dtype = dtype
def __call__(self, batch):
"""Transform a batch and send it to a device.
Args:
batch (list): The batch to transform.
Returns:
tuple(paddle.Tensor, paddle.Tensor, paddle.Tensor)
"""
        # the whole minibatch is wrapped in a one-element list (DataLoader batch_size=1)
assert len(batch) == 1
(xs, ys), utts = batch[0]
assert xs[0] is not None, "please check Reader and Augmentation impl."
# perform subsampling
if self.subsampling_factor > 1:
xs = [x[::self.subsampling_factor, :] for x in xs]
# get batch of lengths of input sequences
ilens = np.array([x.shape[0] for x in xs])
# perform padding and convert to tensor
# currently only support real number
if xs[0].dtype.kind == "c":
xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
# Note(kamo):
# {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
            # Don't create ComplexTensor and give it to E2E here
            # because torch.nn.DataParallel can't handle it.
xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
else:
xs_pad = pad_list(xs, 0).astype(self.dtype)
# NOTE: this is for multi-output (e.g., speech translation)
ys_pad = pad_list(
[np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
self.ignore_id)
olens = np.array(
[y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
return utts, xs_pad, ilens, ys_pad, olens
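A hedged usage sketch of CustomConverter; the feature dim, lengths, and token ids are invented. Note that the converter expects a one-element list wrapping ((xs, ys), utts):
converter = CustomConverter(subsampling_factor=1, dtype=np.float32)
xs = [np.random.randn(100, 83), np.random.randn(80, 83)]  # two feature matrices (T, D)
ys = [np.array([1, 2, 3]), np.array([4, 5])]              # token id sequences
utts = ["utt1", "utt2"]
utts, xs_pad, ilens, ys_pad, olens = converter([((xs, ys), utts)])
# xs_pad: (2, 100, 83) float32, zero-padded; ys_pad padded with ignore_id=-1;
# ilens == [100, 80], olens == [3, 2]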
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Any
from typing import Dict
from typing import List
from typing import Text
import numpy as np
from paddle.io import DataLoader
from deepspeech.frontend.utility import read_manifest
from deepspeech.io.batchfy import make_batchset
from deepspeech.io.converter import CustomConverter
from deepspeech.io.dataset import TransformDataset
from deepspeech.io.reader import LoadInputsAndTargets
from deepspeech.utils.log import Log
__all__ = ["BatchDataLoader"]
logger = Log(__name__).getlog()
def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
mode: Text="asr",
iaxis=0,
oaxis=0):
if mode == 'asr':
        feat_dim = data_json[0]['input'][iaxis]['shape'][1]
vocab_size = data_json[0]['output'][oaxis]['shape'][1]
else:
raise ValueError(f"{mode} mode not support!")
return feat_dim, vocab_size
class BatchDataLoader():
def __init__(self,
json_file: str,
train_mode: bool,
                 sortagrad: int=0,
batch_size: int=0,
maxlen_in: float=float('inf'),
maxlen_out: float=float('inf'),
minibatches: int=0,
mini_batch_size: int=1,
batch_count: str='auto',
batch_bins: int=0,
batch_frames_in: int=0,
batch_frames_out: int=0,
batch_frames_inout: int=0,
preprocess_conf=None,
n_iter_processes: int=1,
subsampling_factor: int=1,
num_encs: int=1):
self.json_file = json_file
self.train_mode = train_mode
self.use_sortagrad = sortagrad == -1 or sortagrad > 0
self.batch_size = batch_size
self.maxlen_in = maxlen_in
self.maxlen_out = maxlen_out
self.batch_count = batch_count
self.batch_bins = batch_bins
self.batch_frames_in = batch_frames_in
self.batch_frames_out = batch_frames_out
self.batch_frames_inout = batch_frames_inout
self.subsampling_factor = subsampling_factor
self.num_encs = num_encs
self.preprocess_conf = preprocess_conf
self.n_iter_processes = n_iter_processes
# read json data
self.data_json = read_manifest(json_file)
self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
self.data_json, mode='asr')
# make minibatch list (variable length)
        self.minibatches = make_batchset(
self.data_json,
batch_size,
maxlen_in,
maxlen_out,
minibatches, # for debug
min_batch_size=mini_batch_size,
shortest_first=self.use_sortagrad,
count=batch_count,
batch_bins=batch_bins,
batch_frames_in=batch_frames_in,
batch_frames_out=batch_frames_out,
batch_frames_inout=batch_frames_inout,
iaxis=0,
oaxis=0, )
# data reader
self.reader = LoadInputsAndTargets(
mode="asr",
load_output=True,
preprocess_conf=preprocess_conf,
preprocess_args={"train":
train_mode}, # Switch the mode of preprocessing
)
# Setup a converter
if num_encs == 1:
self.converter = CustomConverter(
subsampling_factor=subsampling_factor, dtype=np.float32)
else:
            raise NotImplementedError("not impl CustomConverterMulEnc.")
        # hack to make batchsize argument as 1
        # actual batchsize is included in a list
        # default collate function converts numpy array to paddle tensor
        # we use an empty collate function instead which returns list
self.dataset = TransformDataset(
            self.minibatches,
lambda data: self.converter([self.reader(data, return_uttid=True)]))
self.dataloader = DataLoader(
dataset=self.dataset,
batch_size=1,
shuffle=not self.use_sortagrad if train_mode else False,
collate_fn=lambda x: x[0],
num_workers=n_iter_processes, )
def __repr__(self):
echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
echo += f"train_mode: {self.train_mode}, "
echo += f"sortagrad: {self.use_sortagrad}, "
echo += f"batch_size: {self.batch_size}, "
echo += f"maxlen_in: {self.maxlen_in}, "
echo += f"maxlen_out: {self.maxlen_out}, "
echo += f"batch_count: {self.batch_count}, "
echo += f"batch_bins: {self.batch_bins}, "
echo += f"batch_frames_in: {self.batch_frames_in}, "
echo += f"batch_frames_out: {self.batch_frames_out}, "
echo += f"batch_frames_inout: {self.batch_frames_inout}, "
echo += f"subsampling_factor: {self.subsampling_factor}, "
echo += f"num_encs: {self.num_encs}, "
echo += f"num_workers: {self.n_iter_processes}, "
echo += f"file: {self.json_file}"
return echo
def __len__(self):
return len(self.dataloader)
def __iter__(self):
return self.dataloader.__iter__()
def __call__(self):
return self.__iter__()
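A hedged construction sketch for BatchDataLoader; the manifest path is hypothetical, and only sequence counting (non-zero batch_size) is exercised:
loader = BatchDataLoader(
    json_file="data/manifest.train",  # hypothetical path
    train_mode=True,
    batch_size=32,  # non-zero, so batch_count is auto-detected as "seq"
    n_iter_processes=2)
for utts, xs_pad, ilens, ys_pad, olens in loader:
    # padded features (B, Tmax, D) and targets (B, Umax) ready for the model
    break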
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from deepspeech.frontend.utility import read_manifest
from deepspeech.utils.log import Log
__all__ = ["ManifestDataset", "TripletManifestDataset"]
__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
logger = Log(__name__).getlog()
......@@ -76,12 +76,18 @@ class ManifestDataset(Dataset):
Args:
            manifest_path (str): manifest json file path
max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
            max_input_len (float, optional): maximum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
            max_output_len (float, optional): maximum output seq length,
                in modeling units. Defaults to 500.0.
            min_output_len (float, optional): minimum output seq length,
                in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output seq length/input seq length ratio.
                Defaults to 10.0.
            min_output_input_ratio (float, optional): minimum output seq length/input seq length ratio.
                Defaults to 0.05.
"""
super().__init__()
......@@ -116,3 +122,27 @@ class TripletManifestDataset(ManifestDataset):
instance = self._manifest[idx]
return instance["utt"], instance["feat"], instance["text"], instance[
"text1"]
class TransformDataset(Dataset):
"""Transform Dataset.
Args:
data: list object from make_batchset
        transform: transform function
"""
def __init__(self, data, transform):
"""Init function."""
super().__init__()
self.data = data
self.transform = transform
def __len__(self):
"""Len function."""
return len(self.data)
def __getitem__(self, idx):
"""[] operator."""
return self.transform(self.data[idx])
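TransformDataset simply defers work to the supplied callable, so batching and conversion happen lazily per index; a tiny hedged sketch:
ds = TransformDataset([1, 2, 3], lambda x: x * 10)
assert len(ds) == 3 and ds[1] == 20  # transform applied on access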
......@@ -17,11 +17,16 @@ import numpy as np
from deepspeech.utils.log import Log
__all__ = ["pad_sequence"]
__all__ = ["pad_list", "pad_sequence"]
logger = Log(__name__).getlog()
def pad_list(sequences: List[np.ndarray],
padding_value: float=0.0) -> np.ndarray:
return pad_sequence(sequences, True, padding_value)
def pad_sequence(sequences: List[np.ndarray],
batch_first: bool=True,
padding_value: float=0.0) -> np.ndarray:
......
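A hedged sketch of the new pad_list helper, which is just batch-first pad_sequence under another name:
import numpy as np
a, b = np.ones((3, 2)), np.ones((5, 2))
padded = pad_list([a, b], padding_value=0.0)
# padded.shape == (2, 5, 2); the shorter array is zero-padded along axis 0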
......@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
share_weights=share_rnn_weights))
i_size = h_size * 2
self.rnn_stacks = nn.ModuleList(rnn_stacks)
self.rnn_stacks = nn.LayerList(rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
......
......@@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"]
logger = Log(__name__).getlog()
class U2BaseModel(nn.Module):
class U2BaseModel(nn.Layer):
"""CTC-Attention hybrid Encoder-Decoder model"""
@classmethod
......@@ -612,32 +612,32 @@ class U2BaseModel(nn.Module):
best_index = i
return hyps[best_index][0]
#@jit.export
#@jit.to_static
def subsampling_rate(self) -> int:
""" Export interface for c++ call, return subsampling_rate of the
model
"""
return self.encoder.embed.subsampling_rate
#@jit.export
#@jit.to_static
def right_context(self) -> int:
""" Export interface for c++ call, return right_context of the model
"""
return self.encoder.embed.right_context
#@jit.export
#@jit.to_static
def sos_symbol(self) -> int:
""" Export interface for c++ call, return sos symbol id of the model
"""
return self.sos
#@jit.export
#@jit.to_static
def eos_symbol(self) -> int:
""" Export interface for c++ call, return eos symbol id of the model
"""
return self.eos
@jit.export
@jit.to_static
def forward_encoder_chunk(
self,
xs: paddle.Tensor,
......@@ -667,7 +667,7 @@ class U2BaseModel(nn.Module):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
# @jit.export([
# @jit.to_static([
# paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
# ])
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
......@@ -680,7 +680,7 @@ class U2BaseModel(nn.Module):
"""
return self.ctc.log_softmax(xs)
@jit.export
@jit.to_static
def forward_attention_decoder(
self,
hyps: paddle.Tensor,
......
......@@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"]
logger = Log(__name__).getlog()
class U2STBaseModel(nn.Module):
class U2STBaseModel(nn.Layer):
"""CTC-Attention hybrid Encoder-Decoder model"""
@classmethod
......@@ -417,32 +417,32 @@ class U2STBaseModel(nn.Module):
best_hyps = best_hyps[:, 1:]
return best_hyps
@jit.export
@jit.to_static
def subsampling_rate(self) -> int:
""" Export interface for c++ call, return subsampling_rate of the
model
"""
return self.encoder.embed.subsampling_rate
@jit.export
@jit.to_static
def right_context(self) -> int:
""" Export interface for c++ call, return right_context of the model
"""
return self.encoder.embed.right_context
@jit.export
@jit.to_static
def sos_symbol(self) -> int:
""" Export interface for c++ call, return sos symbol id of the model
"""
return self.sos
@jit.export
@jit.to_static
def eos_symbol(self) -> int:
""" Export interface for c++ call, return eos symbol id of the model
"""
return self.eos
@jit.export
@jit.to_static
def forward_encoder_chunk(
self,
xs: paddle.Tensor,
......@@ -472,7 +472,7 @@ class U2STBaseModel(nn.Module):
xs, offset, required_cache_size, subsampling_cache,
elayers_output_cache, conformer_cnn_cache)
@jit.export
@jit.to_static
def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
""" Export interface for c++ call, apply linear transform and log
softmax before ctc
......@@ -483,7 +483,7 @@ class U2STBaseModel(nn.Module):
"""
return self.ctc.log_softmax(xs)
@jit.export
@jit.to_static
def forward_attention_decoder(
self,
hyps: paddle.Tensor,
......
......@@ -69,7 +69,7 @@ class ConvGLUBlock(nn.Layer):
dim=0)
self.dropout_residual = nn.Dropout(p=dropout)
self.pad_left = ConstantPad2d((0, 0, kernel_size - 1, 0), 0)
        self.pad_left = nn.Pad2D((0, 0, kernel_size - 1, 0), value=0.)
layers = OrderedDict()
if bottlececk_dim == 0:
......
......@@ -33,7 +33,7 @@ logger = Log(__name__).getlog()
__all__ = ["TransformerDecoder"]
class TransformerDecoder(nn.Module):
class TransformerDecoder(nn.Layer):
"""Base class of Transfomer decoder module.
Args:
vocab_size: output dim
......@@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module):
self.use_output_layer = use_output_layer
self.output_layer = nn.Linear(attention_dim, vocab_size)
self.decoders = nn.ModuleList([
self.decoders = nn.LayerList([
DecoderLayer(
size=attention_dim,
self_attn=MultiHeadedAttention(attention_heads, attention_dim,
......
......@@ -25,15 +25,15 @@ logger = Log(__name__).getlog()
__all__ = ["DecoderLayer"]
class DecoderLayer(nn.Module):
class DecoderLayer(nn.Layer):
"""Single decoder layer module.
Args:
size (int): Input dimension.
self_attn (nn.Module): Self-attention module instance.
self_attn (nn.Layer): Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
src_attn (nn.Module): Self-attention module instance.
src_attn (nn.Layer): Self-attention module instance.
`MultiHeadedAttention` instance can be used as the argument.
feed_forward (nn.Module): Feed-forward module instance.
feed_forward (nn.Layer): Feed-forward module instance.
`PositionwiseFeedForward` instance can be used as the argument.
dropout_rate (float): Dropout rate.
normalize_before (bool):
......@@ -48,9 +48,9 @@ class DecoderLayer(nn.Module):
def __init__(
self,
size: int,
self_attn: nn.Module,
src_attn: nn.Module,
feed_forward: nn.Module,
self_attn: nn.Layer,
src_attn: nn.Layer,
feed_forward: nn.Layer,
dropout_rate: float,
normalize_before: bool=True,
concat_after: bool=False, ):
......
......@@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder):
pos_enc_layer_type, normalize_before, concat_after,
static_chunk_size, use_dynamic_chunk, global_cmvn,
use_dynamic_left_chunk)
self.encoders = nn.ModuleList([
self.encoders = nn.LayerList([
TransformerEncoderLayer(
size=output_size,
self_attn=MultiHeadedAttention(attention_heads, output_size,
......@@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder):
convolution_layer_args = (output_size, cnn_module_kernel, activation,
cnn_module_norm, causal)
self.encoders = nn.ModuleList([
self.encoders = nn.LayerList([
ConformerEncoderLayer(
size=output_size,
self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
......
......@@ -48,7 +48,8 @@ class CTCLoss(nn.Layer):
logits = logits.transpose([1, 0, 2])
# (TODO:Hui Zhang) ctc loss does not support int64 labels
ys_pad = ys_pad.astype(paddle.int32)
loss = self.loss(logits, ys_pad, hlens, ys_lens)
loss = self.loss(
logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average)
if self.batch_average:
# Batch-size average
loss = loss / B
......
......@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
share_weights=share_rnn_weights))
i_size = h_size * 2
self.rnn_stacks = nn.ModuleList(rnn_stacks)
self.rnn_stacks = nn.LayerList(rnn_stacks)
def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
"""
......
......@@ -47,18 +47,11 @@ def default_argument_parser():
# data and output
parser.add_argument("--config", metavar="FILE", help="path of the config file to overwrite to default config with.")
parser.add_argument("--dump-config", metavar="FILE", help="dump config to yaml file.")
# parser.add_argument("--data", metavar="DATA_DIR", help="path to the datatset.")
parser.add_argument("--output", metavar="OUTPUT_DIR", help="path to save checkpoint and logs.")
# load from saved checkpoint
parser.add_argument("--checkpoint_path", type=str, help="path of the checkpoint to load")
# save jit model to
parser.add_argument("--export_path", type=str, help="path of the jit model to save")
# save asr result to
parser.add_argument("--result_file", type=str, help="path of save the asr result")
# running
parser.add_argument("--device", type=str, default='gpu', choices=["cpu", "gpu"],
help="device type to use, cpu and gpu are supported.")
......
# Featrues
# Features
### Speech Recognition
......
export MAIN_ROOT=${PWD}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:/usr/local/bin:${PATH}
export LC_ALL=C
# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
......
......@@ -26,7 +26,7 @@ python3 -u ${BIN_DIR}/test_export.py \
--device ${device} \
--nproc 1 \
--config ${config_path} \
--result_file ${ckpt_prefix}.rsl \
--result_file ${jit_model_export_path}.rsl \
--export_path ${jit_model_export_path} \
--model_type ${model_type}
......
......@@ -25,6 +25,7 @@ if [ ${seed} ]; then
fi
python3 -u ${BIN_DIR}/train.py \
--model-name u2_kaldi \
--device ${device} \
--nproc ${ngpu} \
--config ${config_path} \
......