diff --git a/deepspeech/exps/u2_st/model.py b/deepspeech/exps/u2_st/model.py index f5a514c72fd8635efc4fa6b5c0e7daf44231138c..9a34cbdc2948a6df6ab9322f9a5eff31ce6b4917 100644 --- a/deepspeech/exps/u2_st/model.py +++ b/deepspeech/exps/u2_st/model.py @@ -31,7 +31,6 @@ from yacs.config import CfgNode from deepspeech.io.collator import SpeechCollator from deepspeech.io.collator import TripletSpeechCollator from deepspeech.io.dataset import ManifestDataset -from deepspeech.io.dataset import TripletManifestDataset from deepspeech.io.sampler import SortagradBatchSampler from deepspeech.io.sampler import SortagradDistributedBatchSampler from deepspeech.models.u2_st import U2STModel @@ -249,12 +248,11 @@ class U2STTrainer(Trainer): config.collator.keep_transcription_text = False # train/valid dataset, return token ids - Dataset = TripletManifestDataset if config.model.model_conf.asr_weight > 0. else ManifestDataset config.data.manifest = config.data.train_manifest - train_dataset = Dataset.from_config(config) + train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - dev_dataset = Dataset.from_config(config) + dev_dataset = ManifestDataset.from_config(config) if config.model.model_conf.asr_weight > 0.: Collator = TripletSpeechCollator diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py index 4c40c8472b424cf395e19f4448af2624c510a5fe..6f3b646c5ac5c0e19bdddc54d9ed398fbf14a263 100644 --- a/deepspeech/frontend/featurizer/audio_featurizer.py +++ b/deepspeech/frontend/featurizer/audio_featurizer.py @@ -24,15 +24,15 @@ class AudioFeaturizer(): Currently, it supports feature types of linear spectrogram and mfcc. - :param specgram_type: Specgram feature type. Options: 'linear'. - :type specgram_type: str + :param spectrum_type: Specgram feature type. Options: 'linear'. + :type spectrum_type: str :param stride_ms: Striding size (in milliseconds) for generating frames. :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins + :param max_freq: When spectrum_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_feq is the + returned; when spectrum_type is 'mfcc', max_feq is the highest band edge of mel filters. 
:types max_freq: None|float :param target_sample_rate: Audio are resampled (if upsampling or @@ -47,7 +47,7 @@ class AudioFeaturizer(): """ def __init__(self, - specgram_type: str='linear', + spectrum_type: str='linear', feat_dim: int=None, delta_delta: bool=False, stride_ms=10.0, @@ -58,7 +58,7 @@ class AudioFeaturizer(): use_dB_normalization=True, target_dB=-20, dither=1.0): - self._specgram_type = specgram_type + self._spectrum_type = spectrum_type # mfcc and fbank using `feat_dim` self._feat_dim = feat_dim # mfcc and fbank using `delta-delta` @@ -113,27 +113,27 @@ class AudioFeaturizer(): def feature_size(self): """audio feature size""" feat_dim = 0 - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': fft_point = self._window_ms if self._fft_point is None else self._fft_point feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + 1) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': # mfcc, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': # fbank, delta, delta-delta feat_dim = int(self._feat_dim * 3) if self._delta_delta else int(self._feat_dim) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) return feat_dim def _compute_specgram(self, audio_segment): """Extract various audio features.""" sample_rate = audio_segment.sample_rate - if self._specgram_type == 'linear': + if self._spectrum_type == 'linear': samples = audio_segment.samples return self._compute_linear_specgram( samples, @@ -141,7 +141,7 @@ class AudioFeaturizer(): stride_ms=self._stride_ms, window_ms=self._window_ms, max_freq=self._max_freq) - elif self._specgram_type == 'mfcc': + elif self._spectrum_type == 'mfcc': samples = audio_segment.to('int16') return self._compute_mfcc( samples, @@ -152,7 +152,7 @@ class AudioFeaturizer(): max_freq=self._max_freq, dither=self._dither, delta_delta=self._delta_delta) - elif self._specgram_type == 'fbank': + elif self._spectrum_type == 'fbank': samples = audio_segment.to('int16') return self._compute_fbank( samples, @@ -164,8 +164,8 @@ class AudioFeaturizer(): dither=self._dither, delta_delta=self._delta_delta) else: - raise ValueError("Unknown specgram_type %s. " - "Supported values: linear." % self._specgram_type) + raise ValueError("Unknown spectrum_type %s. " + "Supported values: linear." % self._spectrum_type) def _specgram_real(self, samples, window_size, stride_size, sample_rate): """Compute the spectrogram for samples from a real signal.""" diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py index f9f7d7c270079384c67d43fb88c28f6285900bbc..7471d164a9e7bbcba85dbb24687828d193e260bf 100644 --- a/deepspeech/frontend/featurizer/speech_featurizer.py +++ b/deepspeech/frontend/featurizer/speech_featurizer.py @@ -17,44 +17,14 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(): - """Speech featurizer, for extracting features from both audio and transcript - contents of SpeechSegment. - - Currently, for audio parts, it supports feature types of linear - spectrogram and mfcc; for transcript parts, it only supports char-level - tokenizing and conversion into a list of token indices. 
Note that the - token indexing order follows the given vocabulary file. - - :param vocab_filepath: Filepath to load vocabulary for token indices - conversion. - :type specgram_type: str - :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. - :type specgram_type: str - :param stride_ms: Striding size (in milliseconds) for generating frames. - :type stride_ms: float - :param window_ms: Window size (in milliseconds) for generating frames. - :type window_ms: float - :param max_freq: When specgram_type is 'linear', only FFT bins - corresponding to frequencies between [0, max_freq] are - returned; when specgram_type is 'mfcc', max_freq is the - highest band edge of mel filters. - :types max_freq: None|float - :param target_sample_rate: Speech are resampled (if upsampling or - downsampling is allowed) to this before - extracting spectrogram features. - :type target_sample_rate: float - :param use_dB_normalization: Whether to normalize the audio to a certain - decibels before extracting the features. - :type use_dB_normalization: bool - :param target_dB: Target audio decibels for normalization. - :type target_dB: float + """Speech and Text feature extraction. """ def __init__(self, unit_type, vocab_filepath, spm_model_prefix=None, - specgram_type='linear', + spectrum_type='linear', feat_dim=None, delta_delta=False, stride_ms=10.0, @@ -70,7 +40,7 @@ class SpeechFeaturizer(): self.window_ms = window_ms self.audio_feature = AudioFeaturizer( - specgram_type=specgram_type, + spectrum_type=spectrum_type, feat_dim=feat_dim, delta_delta=delta_delta, stride_ms=stride_ms, diff --git a/deepspeech/frontend/utility.py b/deepspeech/frontend/utility.py index 2a58123240dd1673a87cc59fe5c1cb0953a47985..f5fc3097e48a39ae88e6190242ebc0081cbc9940 100644 --- a/deepspeech/frontend/utility.py +++ b/deepspeech/frontend/utility.py @@ -15,6 +15,7 @@ import json import math import tarfile +from collections import namedtuple from typing import List from typing import Optional from typing import Text diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index c5c0a414674f0d5ffa4338885a104d09bc3d7833..553ffcb5334ab146b2e3c4d7681c09095095faae 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -32,6 +32,19 @@ __all__ = ["SpeechCollator", "TripletSpeechCollator"] logger = Log(__name__).getlog() +def tokenids(text, keep_transcription_text): + # for training text is token ids + tokens = text # token ids + + if keep_transcription_text: + # text is string, convert to unicode ord + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + + tokens = np.array(tokens, dtype=np.int64) + return tokens + + class SpeechCollatorBase(): def __init__( self, @@ -150,7 +163,6 @@ class SpeechCollatorBase(): # extract speech feature spectrum, transcript_part = self._speech_featurizer.featurize( speech_segment, self.keep_transcription_text) - # CMVN spectrum if self._normalizer: spectrum = self._normalizer.apply(spectrum) @@ -163,38 +175,35 @@ class SpeechCollatorBase(): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. 
+ utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : (B, Umax) + olens: (B,) """ audios = [] audio_lens = [] texts = [] text_lens = [] utts = [] - for utt, audio, text in batch: + + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + text = item['text'] audio, text = self.process_utterance(audio, text) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids, else text is string, convert to unicode ord - tokens = [] - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens = [ord(t) for t in text] - else: - tokens = text # token ids - tokens = np.array(tokens, dtype=np.int64) + + tokens = tokenids(text, self.keep_transcription_text) texts.append(tokens) text_lens.append(tokens.shape[0]) @@ -308,17 +317,19 @@ class TripletSpeechCollator(SpeechCollator): """batch examples Args: - batch ([List]): batch is (audio, text) + batch (List[Dict]): batch is [dict(audio, text, ...)] audio (np.ndarray) shape (T, D) text (List[int] or str): shape (U,) Returns: - tuple(audio, text, audio_lens, text_lens): batched data. - audio : (B, Tmax, D) - audio_lens: (B) - text : (B, Umax) - text_lens: (B) + tuple(utts, xs_pad, ilens, ys_pad, olens): batched data. + utts: (B,) + xs_pad : (B, Tmax, D) + ilens: (B,) + ys_pad : [(B, Umax), (B, Umax)] + olens: [(B,), (B,)] """ + utts = [] audios = [] audio_lens = [] translation_text = [] @@ -326,41 +337,38 @@ class TripletSpeechCollator(SpeechCollator): transcription_text = [] transcription_text_lens = [] - utts = [] - for utt, audio, translation, transcription in batch: + for idx, item in enumerate(batch): + utts.append(item['utt']) + + audio = item['feat'] + translation = item['text'] + transcription = item['text1'] audio, translation, transcription = self.process_utterance( audio, translation, transcription) - #utt - utts.append(utt) - # audio + audios.append(audio) # [T, D] audio_lens.append(audio.shape[0]) - # text - # for training, text is token ids - # else text is string, convert to unicode ord + tokens = [[], []] for idx, text in enumerate([translation, transcription]): - if self.keep_transcription_text: - assert isinstance(text, str), (type(text), text) - tokens[idx] = [ord(t) for t in text] - else: - tokens[idx] = text # token ids - tokens[idx] = np.array(tokens[idx], dtype=np.int64) + tokens[idx] = tokenids(text, self.keep_transcription_text) translation_text.append(tokens[0]) translation_text_lens.append(tokens[0].shape[0]) transcription_text.append(tokens[1]) transcription_text_lens.append(tokens[1].shape[0]) - padded_audios = pad_sequence( - audios, padding_value=0.0).astype(np.float32) #[B, T, D] - audio_lens = np.array(audio_lens).astype(np.int64) - padded_translation = pad_sequence( - translation_text, padding_value=IGNORE_ID).astype(np.int64) + xs_pad = pad_list(audios, 0.0).astype(np.float32) #[B, T, D] + ilens = np.array(audio_lens).astype(np.int64) + + padded_translation = pad_list(translation_text, + IGNORE_ID).astype(np.int64) translation_lens = np.array(translation_text_lens).astype(np.int64) - padded_transcription = pad_sequence( - transcription_text, padding_value=IGNORE_ID).astype(np.int64) + + padded_transcription = pad_list(transcription_text, + IGNORE_ID).astype(np.int64) transcription_lens = np.array(transcription_text_lens).astype(np.int64) - return utts, padded_audios, audio_lens, ( - padded_translation, padded_transcription), (translation_lens, - transcription_lens) 
+ + ys_pad = (padded_translation, padded_transcription) + olens = (translation_lens, transcription_lens) + return utts, xs_pad, ilens, ys_pad, olens diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 56e534756e47fce796887b4de9e25ba9323bfec3..1945c5f7259cf429d9e343e0c4cf909497cfb165 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -19,7 +19,7 @@ from yacs.config import CfgNode from deepspeech.frontend.utility import read_manifest from deepspeech.utils.log import Log -__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"] +__all__ = ["ManifestDataset", "TransformDataset"] logger = Log(__name__).getlog() @@ -107,21 +107,7 @@ class ManifestDataset(Dataset): return len(self._manifest) def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"] - - -class TripletManifestDataset(ManifestDataset): - """ - For Joint Training of Speech Translation and ASR. - text: translation, - text1: transcript. - """ - - def __getitem__(self, idx): - instance = self._manifest[idx] - return instance["utt"], instance["feat"], instance["text"], instance[ - "text1"] + return self._manifest[idx] class TransformDataset(Dataset): @@ -273,5 +259,4 @@ class AudioDataset(Dataset): return len(self.minibatch) def __getitem__(self, idx): - instance = self.minibatch[idx] - return instance["utt"], instance["feat"], instance["text"] + return self.minibatch[idx] diff --git a/deepspeech/io/reader.py b/deepspeech/io/reader.py index 30ae98f06de9d640475214caf843d3b796576ff3..e7c43a7832c726226d5fb9e62eb26134c10daddb 100644 --- a/deepspeech/io/reader.py +++ b/deepspeech/io/reader.py @@ -322,7 +322,7 @@ class LoadInputsAndTargets(): "Not supported: loader_type={}".format(filetype)) def file_type(self, filepath): - suffix = filepath.split(":")[0].split('.')[1] + suffix = filepath.split(":")[0].split('.')[-1] if suffix == 'ark': return 'mat' elif suffix == 'scp': diff --git a/docs/src/data_preparation.md b/docs/src/data_preparation.md index a3d1b3eb44cebd4dd260bb032f2feff7277c453e..34d2a835c220887cf743d5f9379c6853c9115e79 100644 --- a/docs/src/data_preparation.md +++ b/docs/src/data_preparation.md @@ -21,7 +21,7 @@ To perform z-score normalization (zero-mean, unit stddev) upon audio features, w ```bash python3 utils/compute_mean_std.py \ --num_samples 2000 \ ---specgram_type linear \ +--spectrum_type linear \ --manifest_path examples/librispeech/data/manifest.train \ --output_path examples/librispeech/data/mean_std.npz ``` diff --git a/docs/src/deepspeech_architecture.md b/docs/src/deepspeech_architecture.md index b93441222e2d2b8b0df8c745985e0ace9ede1393..5a6ca8867a94d7f138b7aa88b10d61bfd6403dce 100644 --- a/docs/src/deepspeech_architecture.md +++ b/docs/src/deepspeech_architecture.md @@ -44,7 +44,7 @@ For CMVN, a subset or the full of traininig set is chosed and be used to compute cd examples/aishell/s0 python3 ../../../utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/1xt2x/aishell/conf/deepspeech2.yaml b/examples/1xt2x/aishell/conf/deepspeech2.yaml index 6e745e9d1c9dd9e525d49b35fd19711aa343b2ee..c2d6922638098d1f52517ebbb97049dddd89c02a 100644 --- a/examples/1xt2x/aishell/conf/deepspeech2.yaml +++ b/examples/1xt2x/aishell/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - 
specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml index fbc7466f239d8597ff5001c0d684741d9921fc78..be51a9b909d9b01a8a8b4911441496a976769ea2 100644 --- a/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml +++ b/examples/1xt2x/baidu_en8k/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/1xt2x/librispeech/conf/deepspeech2.yaml b/examples/1xt2x/librispeech/conf/deepspeech2.yaml index edef07972b41cabe98ae54edfe9b560299b2e19f..ad7fb2c19f40bc04d2486c3ea728b726d7e9def4 100644 --- a/examples/1xt2x/librispeech/conf/deepspeech2.yaml +++ b/examples/1xt2x/librispeech/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml index 9560930acb9b831ad231279da3dc3bf4b9651a39..ffefaeb31bcab7aaf3c8a511fcbfcd5c9d250111 100644 --- a/examples/aishell/s0/conf/deepspeech2.yaml +++ b/examples/aishell/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/conf/deepspeech2_online.yaml b/examples/aishell/s0/conf/deepspeech2_online.yaml index 7e87594ccbfe0de36d09fcc1bbdb9d9a932603fe..cac599dc74404ad9218727b7e8b246e5957de869 100644 --- a/examples/aishell/s0/conf/deepspeech2_online.yaml +++ b/examples/aishell/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear #linear, mfcc, fbank + spectrum_type: linear #linear, mfcc, fbank feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh index b106f3f28322ced93cd0462f33056bd0d5876df4..1312a12fc57069ba7fa54952e4448e41f2319285 100755 --- a/examples/aishell/s0/local/data.sh +++ b/examples/aishell/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --stride_ms=10.0 \ --window_ms=20.0 \ diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 6f8ae135f6210757208dde85cad85e5ee776f381..9b563da27340349c26cdbe8be528d2a3c87e79b8 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index a4248459c261d442a3e23e18cf538927cda5236a..dfa9a4b0b561e7545397ca31c42e67039c145584 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ 
b/examples/aishell/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh index 8d5ac4d5939e108697d01d5e6624a83f50e34b16..c05c3ea251ad8c16b08bf6fdde8faf2cdceae971 100755 --- a/examples/aishell/s1/local/data.sh +++ b/examples/aishell/s1/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/callcenter/s1/conf/chunk_conformer.yaml b/examples/callcenter/s1/conf/chunk_conformer.yaml index f79b8eaa0f2fb76f1b4f462dee1a8e8c79b3ba46..a853658a859c409cb7109e08b4a9c74d4610fe87 100644 --- a/examples/callcenter/s1/conf/chunk_conformer.yaml +++ b/examples/callcenter/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/conf/conformer.yaml b/examples/callcenter/s1/conf/conformer.yaml index 3b08cc7a1b6521f152a8d36de8c5a2b950d26b31..bd4f45788ef039e8bc302936ca4167dfd86c5585 100644 --- a/examples/callcenter/s1/conf/conformer.yaml +++ b/examples/callcenter/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/callcenter/s1/local/data.sh b/examples/callcenter/s1/local/data.sh index e2640ead7448f22361d1d99c07ef009af72754a4..b2a495b458c2e22502f905128dcbe753fd9bcef2 100755 --- a/examples/callcenter/s1/local/data.sh +++ b/examples/callcenter/s1/local/data.sh @@ -34,7 +34,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then num_workers=$(nproc) python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml index 3f1a376f181bf2cd7066ac0d9d4e858864d661db..47ef94211556a29865253c204ba2a2910a73d671 100644 --- a/examples/librispeech/s0/conf/deepspeech2.yaml +++ b/examples/librispeech/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/conf/deepspeech2_online.yaml b/examples/librispeech/s0/conf/deepspeech2_online.yaml index 180a6205f2af0429b3e42de8fed3772c4e8471b2..e2f9109460740f744920149a9d2135a4becf06dd 100644 --- a/examples/librispeech/s0/conf/deepspeech2_online.yaml +++ b/examples/librispeech/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear 
+ spectrum_type: linear target_sample_rate: 16000 max_freq: None n_fft: None diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh index b71809869d431a716d0f5bc0ca1ffd845b943f08..e3f7b325cbb765520f1a7009602677f4e3d7153f 100755 --- a/examples/librispeech/s0/local/data.sh +++ b/examples/librispeech/s0/local/data.sh @@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=2000 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 92db20f6688daea729cec7000e8ccbe6497ecb03..872b560bede2caf32d1fed5986b6ae79f155e4ac 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index e0bc3135e0a75509da68561079127793a73b41a8..132a4f9d2c73d5c073f6c4509e6c8a09decb4be9 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml index 78be249cb72e2b72f158cdb5f0187f64097edcf3..769ed5f5808d2edfd840a14f666f45bb46a7c2b7 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml index e4a067677f73dfddcbdb284160508e38c0645c61..c9dc1413b36f02c673dcd942323af666fcf5ff35 100644 --- a/examples/librispeech/s1/conf/transformer.yaml +++ b/examples/librispeech/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 32 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh index 4ad476d3785ba8b9515d3e9eea37f3a568dc8256..2b6af2295948078172c6630eec317e0d7e8dbd25 100755 --- a/examples/librispeech/s1/local/data.sh +++ b/examples/librispeech/s1/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ 
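The YAML and shell hunks above and below are a mechanical rename of the collator option `specgram_type` to `spectrum_type`; the values themselves (`linear`, `mfcc`, `fbank`) are unchanged. For configs that live outside this patch, a migration sketch like the one below could apply the same rename. It is not part of this change set: the helper is hypothetical, assumes PyYAML and the `collator:` layout used by the example configs, and does not preserve comments or key order.

```python
# Hypothetical migration helper (not part of this patch): rename the collator
# key `specgram_type` -> `spectrum_type` in a YAML config. Assumes PyYAML and
# the `collator:` section layout shown in the example configs; comments in the
# original file are not preserved by safe_dump.
import sys
import yaml


def migrate(path: str) -> None:
    with open(path) as f:
        conf = yaml.safe_load(f)
    collator = conf.get("collator") or {}
    if "specgram_type" in collator:
        # Only the key name changes; the value (linear/mfcc/fbank) is kept.
        collator["spectrum_type"] = collator.pop("specgram_type")
        with open(path, "w") as f:
            yaml.safe_dump(conf, f, default_flow_style=False, sort_keys=False)


if __name__ == "__main__":
    for config_path in sys.argv[1:]:
        migrate(config_path)
```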
diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml index 92db20f6688daea729cec7000e8ccbe6497ecb03..872b560bede2caf32d1fed5986b6ae79f155e4ac 100644 --- a/examples/librispeech/s2/conf/chunk_conformer.yaml +++ b/examples/librispeech/s2/conf/chunk_conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml index e0bc3135e0a75509da68561079127793a73b41a8..132a4f9d2c73d5c073f6c4509e6c8a09decb4be9 100644 --- a/examples/librispeech/s2/conf/chunk_transformer.yaml +++ b/examples/librispeech/s2/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml index 9a72741350219b3455388d73d354ba9004890530..bc87466eb5aebb0700a8fa6d5a940b300d1173d0 100644 --- a/examples/librispeech/s2/conf/conformer.yaml +++ b/examples/librispeech/s2/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 16 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/librispeech/s2/local/data.sh b/examples/librispeech/s2/local/data.sh index 4ad476d3785ba8b9515d3e9eea37f3a568dc8256..2b6af2295948078172c6630eec317e0d7e8dbd25 100755 --- a/examples/librispeech/s2/local/data.sh +++ b/examples/librispeech/s2/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml index 1aad86d22f02afc68d9eba0c0cb76406873fbfc4..8c03e328db48f7169f1f0c049dca0a5ca5622d81 100644 --- a/examples/ted_en_zh/t0/conf/transformer.yaml +++ b/examples/ted_en_zh/t0/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml index 0144c40d4bf0f2b66bafefbda41acbc1708be170..cbfae93e6d21aa8dec98885f50afac5df25aed2b 100644 --- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml @@ -18,7 +18,7 @@ collator: # augmentation_config: conf/augmentation.json batch_size: 10 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/ted_en_zh/t0/local/data.sh 
b/examples/ted_en_zh/t0/local/data.sh index 32cfd9d7af1106316cdf98de9a77c676c6677ac2..43911c348230e0cca1ff70dcd8358e577b3fc7ec 100755 --- a/examples/ted_en_zh/t0/local/data.sh +++ b/examples/ted_en_zh/t0/local/data.sh @@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml index c3b519968822b888693b8cfb302a3fb7ad4156a6..1ae9acd095c8348dfcc862c4481cb8467b2520ba 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/s1/conf/transformer.yaml @@ -17,7 +17,7 @@ collator: augmentation_config: "" batch_size: 64 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/timit/s1/local/data.sh b/examples/timit/s1/local/data.sh index 1d16f454a2a98a020753b56feb29cc49f647e830..f4be90482ea37c16bc0d680ef326bad08dd1b1c2 100755 --- a/examples/timit/s1/local/data.sh +++ b/examples/timit/s1/local/data.sh @@ -45,7 +45,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.train.raw" \ --num_samples=-1 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 408996557e79a058cba4fec8eea107ffdea4eb87..a7940cb2ffdfc13afe7518143d54bd9dd2701f56 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/conf/deepspeech2_online.yaml b/examples/tiny/s0/conf/deepspeech2_online.yaml index 0098a226c8c7ee8ffd7abb1fda280a1427e4e633..7e30409fb1d9bae52db354c4cd20ccb7dbd9f333 100644 --- a/examples/tiny/s0/conf/deepspeech2_online.yaml +++ b/examples/tiny/s0/conf/deepspeech2_online.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: - specgram_type: linear + spectrum_type: linear feat_dim: delta_delta: False stride_ms: 10.0 diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh index 02fdb70673e639c47ccc79c550e6507fe016cb45..fabf2e4048c4a08425a9e2295a36e0a53bed96d7 100755 --- a/examples/tiny/s0/local/data.sh +++ b/examples/tiny/s0/local/data.sh @@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="linear" \ + --spectrum_type="linear" \ --delta_delta=false \ --sample_rate=16000 \ --stride_ms=10.0 \ diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index be2e82f9e675e484f28ef0aead5306e36eefbc33..f3c7e1dd8771125110b8a5282e169e0454da957f 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi 
feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 93439a85782f8cb08cc3040ae3a80625d9572117..8300575484cca49cb9825b67e01df8b6601ac8f4 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index 9bb67c44e54a968d2fb6e891592aaf0d0a2e3db4..628e3b77e92df753f169ac8348ae7c57dbb81472 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index fcbe1da4ac2f9911f22963aaaa1c8cbb18c7e991..27ffcae4b6621532f5167b44f95011826226c65d 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -18,7 +18,7 @@ collator: augmentation_config: conf/augmentation.json batch_size: 4 raw_wav: True # use raw_wav or kaldi feature - specgram_type: fbank #linear, mfcc, fbank + spectrum_type: fbank #linear, mfcc, fbank feat_dim: 80 delta_delta: False dither: 1.0 diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh index 2aea250b5decf5ecfffa92c7330d34d6e3e6cf81..b5dbd5812f6b6006db0f0a042d924dcf5fc345f2 100755 --- a/examples/tiny/s1/local/data.sh +++ b/examples/tiny/s1/local/data.sh @@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ --manifest_path="data/manifest.tiny.raw" \ --num_samples=64 \ - --specgram_type="fbank" \ + --spectrum_type="fbank" \ --feat_dim=80 \ --delta_delta=false \ --sample_rate=16000 \ diff --git a/utils/compute_mean_std.py b/utils/compute_mean_std.py index a468153d3d2bba4a9afee02e933e6bf3c942606f..0f63715a286c1af82a99f69eb56cf273620fef3d 100755 --- a/utils/compute_mean_std.py +++ b/utils/compute_mean_std.py @@ -27,7 +27,7 @@ add_arg = functools.partial(add_arguments, argparser=parser) # yapf: disable add_arg('num_samples', int, 2000, "# of samples to for statistics.") -add_arg('specgram_type', str, +add_arg('spectrum_type', str, 'linear', "Audio feature type. Options: linear, mfcc, fbank.", choices=['linear', 'mfcc', 'fbank']) @@ -58,7 +58,7 @@ def main(): augmentation_pipeline = AugmentationPipeline('{}') audio_featurizer = AudioFeaturizer( - specgram_type=args.specgram_type, + spectrum_type=args.spectrum_type, feat_dim=args.feat_dim, delta_delta=args.delta_delta, stride_ms=args.stride_ms,
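On the Python side, the renamed keyword reaches `AudioFeaturizer` directly, as the final `compute_mean_std.py` hunk shows. A minimal construction sketch with the new argument name follows; only `spectrum_type`, `feat_dim`, `delta_delta`, `stride_ms`, and `dither` are visible in the hunks above, so the remaining values are illustrative assumptions based on the class docstring rather than settings taken from this patch.

```python
# Minimal sketch of the renamed keyword in code (mirrors the compute_mean_std.py
# hunk above). spectrum_type/feat_dim/delta_delta/stride_ms/dither appear in the
# diff; window_ms is assumed from the class docstring, and the fbank settings
# are illustrative rather than taken from this patch.
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    spectrum_type="fbank",  # was: specgram_type="fbank"
    feat_dim=80,
    delta_delta=False,
    stride_ms=10.0,
    window_ms=25.0,
    dither=1.0)

# For spectrum_type="fbank", feature_size resolves to feat_dim (80 here), or
# feat_dim * 3 when delta_delta=True; an unknown spectrum_type raises ValueError.
```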