diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index f5a514c72fd8635efc4fa6b5c0e7daf44231138c..9a34cbdc2948a6df6ab9322f9a5eff31ce6b4917 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -31,7 +31,6 @@ from yacs.config import CfgNode from deepspeech.io.collator import SpeechCollator from deepspeech.io.collator import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset -from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2_st import U2STModel @@ -249,12 +248,11 @@ class U2STTrainer(Trainer): config.collator.keep_transcription_text = False # train/valid dataset, return token ids - Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset config.data.manifest = config.data.train_manifest - train_dataset = Dataset.from_config(config) + train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - dev_dataset = Dataset.from_config(config) + dev_dataset = ManifestDataset.from_config(config) if config.model.model_conf.asr_weight > 0.: Collator = TripletSpeechCollator diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index 4c40c8472b424cf395e19f4448af2624c510a5fe..6f3b646c5ac5c0e19bdddc54d9ed398fbf14a263 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -24,15 +24,15 @@ class AudioFeaturizer(): Currently, it supports feature types of linear spectrogram and mfcc. - :param specgram_type: Specgram feature type. Options: 'linear'. - :type specgram_type: str + :param spectrum_type: Specgram feature type. Options: 'linear'. + :type spectrum_type: str :param stride_ms: Striding size (in milliseconds) for generating frames. :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins + :param max_freq: When spectrum_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_feq is the + returned; when spectrum_type is 'mfcc', max_feq is the highest band edge of mel filters. 
:types max_freq: None|float :param target_sample_rate: Audio are resampled (if upsampling or @@ -47,7 +47,7 @@ class AudioFeaturizer(): """ def __init__(self, - specgram_type: str='linear', + spectrum_type: str='linear', feat_dim: int=None, delta_delta: bool=False, stride_ms=10.0, @@ -58,7 +58,7 @@ class AudioFeaturizer(): use_dB_normalization=True, target_dB=-20, dither=1.0): - self._specgram_type = specgram_type + self._spectrum_type = spectrum_type # mfcc and fbank using `feat_dim` self._feat_dim = feat_dim # mfcc and fbank using `delta-delta` @@ -113,27 +113,27 @@ class AudioFeaturizer(): def feature_size(self): """audio feature size""" feat_dim = 0 - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': fft_point = self._window_ms if self._fft_point is None else self._fft_point feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': # mfcc, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': # fbank, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) return feat_dim def _compute_specgram(self, audio_segment): """Extract various audio features.""" sample_rate = audio_segment.sample_rate - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': samples = audio_segment.samples return self._compute_linear_specgram( samples, @@ -141,7 +141,7 @@ class AudioFeaturizer(): stride_ms=self._stride_ms, window_ms=self._window_ms, max_freq=self._max_freq) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': samples = audio_segment.to('int16') return self._compute_mfcc( samples, @@ -152,7 +152,7 @@ class AudioFeaturizer(): max_freq=self._max_freq, dither=self._dither, delta_delta=self._delta_delta) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': samples = audio_segment.to('int16') return self._compute_fbank( samples, @@ -164,8 +164,8 @@ class AudioFeaturizer(): dither=self._dither, delta_delta=self._delta_delta) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) def _specgram_real(self, samples, window_size, stride_size, sample_rate): """Compute the spectrogram for samples from a real signal.""" diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index f9f7d7c270079384c67d43fb88c28f6285900bbc..7471d164a9e7bbcba85dbb24687828d193e260bf 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(): - """Speech featurizer, for extracting features from both audio and transcript - contents of SpeechSegment. - - Currently, for audio parts, it supports feature types of linear - spectrogram and mfcc; for transcript parts, it only supports char-level - tokenizing and conversion into a list of token indices. 
Note that the - token indexing order follows the given vocabulary file. - - :param vocab_filepath: Filepath to load vocabulary for token indices - conversion. - :type specgram_type: str - :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. - :type specgram_type: str - :param stride_ms: Striding size (in milliseconds) for generating frames. - :type stride_ms: float - :param window_ms: Window size (in milliseconds) for generating frames. - :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins - corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_freq is the - highest band edge of mel filters. - :types max_freq: None|float - :param target_sample_rate: Speech are resampled (if upsampling or - downsampling is allowed) to this before - extracting spectrogram features. - :type target_sample_rate: float - :param use_dB_normalization: Whether to normalize the audio to a certain - decibels before extracting the features. - :type use_dB_normalization: bool - :param target_dB: Target audio decibels for normalization. - :type target_dB: float + """Speech and Text feature extraction. """ def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None, - specgram_type='linear', + spectrum_type='linear', feat_dim=None, delta_delta=False, stride_ms=10.0, @@ -70,7 +40,7 @@ class SpeechFeaturizer(): self.window_ms = window_ms self.audio_feature = AudioFeaturizer( - specgram_type=specgram_type, + spectrum_type=spectrum_type, feat_dim=feat_dim, delta_delta=delta_delta, stride_ms=stride_ms, diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 2a58123240dd1673a87cc59fe5c1cb0953a47985..f5fc3097e48a39ae88e6190242ebc0081cbc9940 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -15,6 +15,7 @@ import json import math import tarfile +from collections import namedtuple from typing import List from typing import Optional from typing import Text diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index c5c0a414674f0d5ffa4338885a104d09bc3d7833..553ffcb5334ab146b2e3c4d7681c09095095faae 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"] logger = Log(__name__).getlog() +def tokenids(text, keep_transcription_text): + # for training text is token ids + tokens = text # token ids + + if keep_transcription_text: + # text is string, convert to unicode ord + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + + tokens = np.array(tokens, dtype=np.int64) + return tokens + + class SpeechCollatorBase(): def __init__( self, @@ -150,7 +163,6 @@ class SpeechCollatorBase(): # extract speech feature spectrum, transcript_part = self._speech_featurizer.featurize( speech_segment, self.keep_transcription_text) - # CMVN spectrum if self._normalizer: spectrum = self._normalizer.apply(spectrum) @@ -163,38 +175,35 @@ class SpeechCollatorBase(): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. 
+ utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : (B, Umax) + olens: (B,) """ audios = [] audio_lens = [] texts = [] text_lens = [] utts = [] - for utt, audio, text in batch: + + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + text = item['text'] audio, text = self.process_utterance(audio, text) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids, else text is string, convert to unicode ord - tokens = [] - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens = [ord(t) for t in text] - else: - tokens = text # token ids - tokens = np.array(tokens, dtype=np.int64) + + tokens = tokenids(text, self.keep_transcription_text) texts.append(tokens) text_lens.append(tokens.shape[0]) @@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. + utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : [(B, Umax), (B, Umax)] + olens: [(B,), (B,)] """ + utts = [] audios = [] audio_lens = [] translation_text = [] @@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator): transcription_text = [] transcription_text_lens = [] - utts = [] - for utt, audio, translation, transcription in batch: + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + translation = item['text'] + transcription = item['text1'] audio, translation, transcription = self.process_utterance( audio, translation, transcription) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord + tokens = [[], []] for idx, text in enumerate([translation, transcription]): - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = np.array(tokens[idx], dtype=np.int64) + tokens[idx] = tokenids(text, self.keep_transcription_text) translation_text.append(tokens[0]) translation_text_lens.append(tokens[0].shape[0]) transcription_text.append(tokens[1]) transcription_text_lens.append(tokens[1].shape[0]) - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) + xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D] + ilens = np.array(audio_lens).astype(np.int64) + + padded_translation = pad_list(translation_text, + IGNORE_ID).astype(np.int64) translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) + + padded_transcription = pad_list(transcription_text, + IGNORE_ID).astype(np.int64) transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) 
+ + ys_pad = (padded_translation, padded_transcription) + olens = (translation_lens, transcription_lens) + return utts, xs_pad, ilens, ys_pad, olens diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 56e534756e47fce796887b4de9e25ba9323bfec3..1945c5f7259cf429d9e343e0c4cf909497cfb165 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -19,7 +19,7 @@ from yacs.config import CfgNode from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log -__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"] +__all__ = ["ManifestDataset", "TransformDataset"] logger = Log(__name__).getlog() @@ -107,21 +107,7 @@ class ManifestDataset(Dataset): return len(self._manifest) def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"] - - -class TripletManifestDataset(ManifestDataset): - """ - For Joint Training of Speech Translation and ASR. - text: translation, - text1: transcript. - """ - - def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"], instance[ - "text1"] + return self._manifest[idx] class TransformDataset(Dataset): @@ -273,5 +259,4 @@ class AudioDataset(Dataset): return len(self.minibatch) def __getitem__(self, idx): - instance = self.minibatch[idx] - return instance["utt"], instance["feat"], instance["text"] + return self.minibatch[idx] diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index 30ae98f06de9d640475214caf843d3b796576ff3..e7c43a7832c726226d5fb9e62eb26134c10daddb 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -322,7 +322,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[1] + suffix = filepath.split(":")[0].split('.')[-1] if suffix == 'ark': return 'mat' elif suffix == 'scp': diff --git a/docs/src/data_preparation.md b/docs/src/data_preparation.md index a3d1b3eb44cebd4dd260bb032f2feff7277c453e..34d2a835c220887cf743d5f9379c6853c9115e79 100644 --- a/docs/src/data_preparation.md +++ b/docs/src/data_preparation.md @@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w ```bash python3 utils/compute_mean_std.py \ --num_samples 2000 \ ---specgram_type linear \ +--spectrum_type linear \ --manifest_path examples/librispeech/data/manifest.train \ --output_path examples/librispeech/data/mean_std.npz ``` diff --git a/docs/src/deepspeech_architecture.md b/docs/src/deepspeech_architecture.md index b93441222e2d2b8b0df8c745985e0ace9ede1393..5a6ca8867a94d7f138b7aa88b10d61bfd6403dce 100644 --- a/docs/src/deepspeech_architecture.md +++ b/docs/src/deepspeech_architecture.md @@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute cd examples/aishell/s0 python3 ../../../utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/1xt2x/aishell/conf/deepspeech2.yaml b/examples/1xt2x/aishell/conf/deepspeech2.yaml index 6e745e9d1c9dd9e525d49b35fd19711aa343b2ee..c2d6922638098d1f52517ebbb97049dddd89c02a 100644 --- a/examples/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/1xt2x/aishell/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - 
specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml index fbc7466f239d8597ff5001c0d684741d9921fc78..be51a9b909d9b01a8a8b4911441496a976769ea2 100644 --- a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/1xt2x/librispeech/conf/deepspeech2.yaml index edef07972b41cabe98ae54edfe9b560299b2e19f..ad7fb2c19f40bc04d2486c3ea728b726d7e9def4 100644 --- a/examples/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/1xt2x/librispeech/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 9560930acb9b831ad231279da3dc3bf4b9651a39..ffefaeb31bcab7aaf3c8a511fcbfcd5c9d250111 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml index 7e87594ccbfe0de36d09fcc1bbdb9d9a932603fe..cac599dc74404ad9218727b7e8b246e5957de869 100644 --- a/examples/aishell/s0/conf/deepspeech2_online.yaml +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear #linear, mfcc, fbank + spectrum_type: linear #linear, mfcc, fbank feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index b106f3f28322ced93cd0462f33056bd0d5876df4..1312a12fc57069ba7fa54952e4448e41f2319285 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 6f8ae135f6210757208dde85cad85e5ee776f381..9b563da27340349c26cdbe8be528d2a3c87e79b8 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index a4248459c261d442a3e23e18cf538927cda5236a..dfa9a4b0b561e7545397ca31c42e67039c145584 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ 
b/examples/aishell/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 8d5ac4d5939e108697d01d5e6624a83f50e34b16..c05c3ea251ad8c16b08bf6fdde8faf2cdceae971 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml index f79b8eaa0f2fb76f1b4f462dee1a8e8c79b3ba46..a853658a859c409cb7109e08b4a9c74d4610fe87 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml index 3b08cc7a1b6521f152a8d36de8c5a2b950d26b31..bd4f45788ef039e8bc302936ca4167dfd86c5585 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index e2640ead7448f22361d1d99c07ef009af72754a4..b2a495b458c2e22502f905128dcbe753fd9bcef2 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index 3f1a376f181bf2cd7066ac0d9d4e858864d661db..47ef94211556a29865253c204ba2a2910a73d671 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml index 180a6205f2af0429b3e42de8fed3772c4e8471b2..e2f9109460740f744920149a9d2135a4becf06dd 100644 --- a/examples/librispeech/s0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear 
+ spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index b71809869d431a716d0f5bc0ca1ffd845b943f08..e3f7b325cbb765520f1a7009602677f4e3d7153f 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=2000 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 92db20f6688daea729cec7000e8ccbe6497ecb03..872b560bede2caf32d1fed5986b6ae79f155e4ac 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index e0bc3135e0a75509da68561079127793a73b41a8..132a4f9d2c73d5c073f6c4509e6c8a09decb4be9 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 78be249cb72e2b72f158cdb5f0187f64097edcf3..769ed5f5808d2edfd840a14f666f45bb46a7c2b7 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index e4a067677f73dfddcbdb284160508e38c0645c61..c9dc1413b36f02c673dcd942323af666fcf5ff35 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 4ad476d3785ba8b9515d3e9eea37f3a568dc8256..2b6af2295948078172c6630eec317e0d7e8dbd25 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ 
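The YAML and shell hunks above and below are a mechanical rename of the collator option `specgram_type` to `spectrum_type`; the values themselves (`linear`, `mfcc`, `fbank`) are unchanged. For configs that live outside this patch, a migration sketch like the one below could apply the same rename. It is not part of this change set: the helper is hypothetical, assumes PyYAML and the `collator:` layout used by the example configs, and does not preserve comments or key order.

```python
# Hypothetical migration helper (not part of this patch): rename the collator
# key `specgram_type` -> `spectrum_type` in a YAML config. Assumes PyYAML and
# the `collator:` section layout shown in the example configs; comments in the
# original file are not preserved by safe_dump.
import sys
import yaml


def migrate(path: str) -> None:
    with open(path) as f:
        conf = yaml.safe_load(f)
    collator = conf.get("collator") or {}
    if "specgram_type" in collator:
        # Only the key name changes; the value (linear/mfcc/fbank) is kept.
        collator["spectrum_type"] = collator.pop("specgram_type")
        with open(path, "w") as f:
            yaml.safe_dump(conf, f, default_flow_style=False, sort_keys=False)


if __name__ == "__main__":
    for config_path in sys.argv[1:]:
        migrate(config_path)
```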
diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml index 92db20f6688daea729cec7000e8ccbe6497ecb03..872b560bede2caf32d1fed5986b6ae79f155e4ac 100644 --- a/examples/librispeech/s2/conf/chunk_conformer.yaml +++ b/examples/librispeech/s2/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml index e0bc3135e0a75509da68561079127793a73b41a8..132a4f9d2c73d5c073f6c4509e6c8a09decb4be9 100644 --- a/examples/librispeech/s2/conf/chunk_transformer.yaml +++ b/examples/librispeech/s2/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml index 9a72741350219b3455388d73d354ba9004890530..bc87466eb5aebb0700a8fa6d5a940b300d1173d0 100644 --- a/examples/librispeech/s2/conf/conformer.yaml +++ b/examples/librispeech/s2/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh index 4ad476d3785ba8b9515d3e9eea37f3a568dc8256..2b6af2295948078172c6630eec317e0d7e8dbd25 100755 --- a/examples/librispeech/s2/local/data.sh +++ b/examples/librispeech/s2/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml index 1aad86d22f02afc68d9eba0c0cb76406873fbfc4..8c03e328db48f7169f1f0c049dca0a5ca5622d81 100644 --- a/examples/ted_en_zh/t0/conf/transformer.yaml +++ b/examples/ted_en_zh/t0/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml index 0144c40d4bf0f2b66bafefbda41acbc1708be170..cbfae93e6d21aa8dec98885f50afac5df25aed2b 100644 --- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/local/data.sh 
b/examples/ted_en_zh/t0/local/data.sh index 32cfd9d7af1106316cdf98de9a77c676c6677ac2..43911c348230e0cca1ff70dcd8358e577b3fc7ec 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml index c3b519968822b888693b8cfb302a3fb7ad4156a6..1ae9acd095c8348dfcc862c4481cb8467b2520ba 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/s1/conf/transformer.yaml @@ -17,7 +17,7 @@ collator: augmentation_config: "" batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index 1d16f454a2a98a020753b56feb29cc49f647e830..f4be90482ea37c16bc0d680ef326bad08dd1b1c2 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 408996557e79a058cba4fec8eea107ffdea4eb87..a7940cb2ffdfc13afe7518143d54bd9dd2701f56 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml index 0098a226c8c7ee8ffd7abb1fda280a1427e4e633..7e30409fb1d9bae52db354c4cd20ccb7dbd9f333 100644 --- a/examples/tiny/s0/conf/deepspeech2_online.yaml +++ b/examples/tiny/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index 02fdb70673e639c47ccc79c550e6507fe016cb45..fabf2e4048c4a08425a9e2295a36e0a53bed96d7 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index be2e82f9e675e484f28ef0aead5306e36eefbc33..f3c7e1dd8771125110b8a5282e169e0454da957f 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi 
feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 93439a85782f8cb08cc3040ae3a80625d9572117..8300575484cca49cb9825b67e01df8b6601ac8f4 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 9bb67c44e54a968d2fb6e891592aaf0d0a2e3db4..628e3b77e92df753f169ac8348ae7c57dbb81472 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index fcbe1da4ac2f9911f22963aaaa1c8cbb18c7e991..27ffcae4b6621532f5167b44f95011826226c65d 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index 2aea250b5decf5ecfffa92c7330d34d6e3e6cf81..b5dbd5812f6b6006db0f0a042d924dcf5fc345f2 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index a468153d3d2bba4a9afee02e933e6bf3c942606f..0f63715a286c1af82a99f69eb56cf273620fef3d 100755 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('num_samples', int, 2000, "# of samples to for statistics.") -add_arg('specgram_type', str, +add_arg('spectrum_type', str, 'linear', "Audio feature type. Options: linear, mfcc, fbank.", choices=['linear', 'mfcc', 'fbank']) @@ -58,7 +58,7 @@ def main(): augmentation_pipeline = AugmentationPipeline('{}') audio_featurizer = AudioFeaturizer( - specgram_type=args.specgram_type, + spectrum_type=args.spectrum_type, feat_dim=args.feat_dim, delta_delta=args.delta_delta, stride_ms=args.stride_ms,
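On the Python side, the renamed keyword reaches `AudioFeaturizer` directly, as the final `compute_mean_std.py` hunk shows. A minimal construction sketch with the new argument name follows; only `spectrum_type`, `feat_dim`, `delta_delta`, `stride_ms`, and `dither` are visible in the hunks above, so the remaining values are illustrative assumptions based on the class docstring rather than settings taken from this patch.

```python
# Minimal sketch of the renamed keyword in code (mirrors the compute_mean_std.py
# hunk above). spectrum_type/feat_dim/delta_delta/stride_ms/dither appear in the
# diff; window_ms is assumed from the class docstring, and the fbank settings
# are illustrative rather than taken from this patch.
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    spectrum_type="fbank",  # was: specgram_type="fbank"
    feat_dim=80,
    delta_delta=False,
    stride_ms=10.0,
    window_ms=25.0,
    dither=1.0)

# For spectrum_type="fbank", feature_size resolves to feat_dim (80 here), or
# feat_dim * 3 when delta_delta=True; an unknown spectrum_type raises ValueError.
```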