From 6f83be1ad30825222e18a2b0566caf63ae4132b4 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Sun, 3 Oct 2021 03:36:01 +0000
Subject: [PATCH] fix spectrum_type typo

---
 deepspeech/exps/deepspeech2/config.py         |  2 +-
 .../frontend/featurizer/audio_featurizer.py   | 32 +++++++++----------
 .../frontend/featurizer/speech_featurizer.py  | 14 ++++----
 deepspeech/io/__init__.py                     |  4 +--
 deepspeech/io/dataset.py                      | 10 +++---
 examples/aishell/s0/conf/deepspeech2.yaml     |  2 +-
 examples/aishell/s0/local/data.sh             |  2 +-
 examples/aishell/s1/conf/chunk_conformer.yaml |  2 +-
 examples/aishell/s1/conf/conformer.yaml       |  2 +-
 examples/aishell/s1/local/data.sh             |  2 +-
 examples/librispeech/s0/conf/deepspeech2.yaml |  2 +-
 examples/librispeech/s0/local/data.sh         |  2 +-
 .../librispeech/s1/conf/chunk_confermer.yaml  |  2 +-
 .../s1/conf/chunk_transformer.yaml            |  2 +-
 examples/librispeech/s1/conf/conformer.yaml   |  2 +-
 examples/librispeech/s1/conf/transformer.yaml |  2 +-
 examples/librispeech/s1/local/data.sh         |  2 +-
 examples/tiny/s0/conf/deepspeech2.yaml        |  2 +-
 examples/tiny/s0/local/data.sh                |  4 +--
 examples/tiny/s1/conf/chunk_confermer.yaml    |  2 +-
 examples/tiny/s1/conf/chunk_transformer.yaml  |  2 +-
 examples/tiny/s1/conf/conformer.yaml          |  2 +-
 examples/tiny/s1/conf/transformer.yaml        |  2 +-
 examples/tiny/s1/local/data.sh                |  2 +-
 24 files changed, 51 insertions(+), 51 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index 89c2bfa9..633e38ff 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -32,7 +32,7 @@ _C.data = CN(
         window_ms=20.0,  # ms
         n_fft=None,  # fft points
         max_freq=None,  # None for samplerate/2
-        specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+        spectrum_type='linear',  # 'linear', 'mfcc', 'fbank'
         feat_dim=0,  # 'mfcc', 'fbank'
         delat_delta=False,  # 'mfcc', 'fbank'
         target_sample_rate=16000,  # target sample rate
diff --git a/deepspeech/frontend/featurizer/audio_featurizer.py b/deepspeech/frontend/featurizer/audio_featurizer.py
index 11c1fa2d..2f3163fa 100644
--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -24,15 +24,15 @@ class AudioFeaturizer(object):
 
     Currently, it supports feature types of linear spectrogram and mfcc.
 
-    :param specgram_type: Specgram feature type. Options: 'linear'.
-    :type specgram_type: str
+    :param spectrum_type: Spectrum feature type. Options: 'linear', 'mfcc', 'fbank'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
                      corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_feq is the
+                     returned; when spectrum_type is 'mfcc', max_freq is the
                      highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Audio are resampled (if upsampling or
@@ -47,7 +47,7 @@ class AudioFeaturizer(object):
     """
 
     def __init__(self,
-                 specgram_type: str='linear',
+                 spectrum_type: str='linear',
                  feat_dim: int=None,
                  delta_delta: bool=False,
                  stride_ms=10.0,
@@ -58,7 +58,7 @@ class AudioFeaturizer(object):
                  use_dB_normalization=True,
                  target_dB=-20,
                  dither=1.0):
-        self._specgram_type = specgram_type
+        self._spectrum_type = spectrum_type
         # mfcc and fbank using `feat_dim`
         self._feat_dim = feat_dim
         # mfcc and fbank using `delta-delta`
@@ -113,27 +113,27 @@ class AudioFeaturizer(object):
     def feature_size(self):
         """audio feature size"""
         feat_dim = 0
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             fft_point = self._window_ms if self._fft_point is None else self._fft_point
             feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
                            1)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             # mfcc, delta, delta-delta
             feat_dim = int(self._feat_dim *
                            3) if self._delta_delta else int(self._feat_dim)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             # fbank, delta, delta-delta
             feat_dim = int(self._feat_dim *
                            3) if self._delta_delta else int(self._feat_dim)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: 'linear', 'mfcc', 'fbank'." % self._spectrum_type)
         return feat_dim
 
     def _compute_specgram(self, audio_segment):
         """Extract various audio features."""
         sample_rate = audio_segment.sample_rate
-        if self._specgram_type == 'linear':
+        if self._spectrum_type == 'linear':
             samples = audio_segment.samples
             return self._compute_linear_specgram(
                 samples,
@@ -141,7 +141,7 @@ class AudioFeaturizer(object):
                 stride_ms=self._stride_ms,
                 window_ms=self._window_ms,
                 max_freq=self._max_freq)
-        elif self._specgram_type == 'mfcc':
+        elif self._spectrum_type == 'mfcc':
             samples = audio_segment.to('int16')
             return self._compute_mfcc(
                 samples,
@@ -152,7 +152,7 @@ class AudioFeaturizer(object):
                 max_freq=self._max_freq,
                 dither=self._dither,
                 delta_delta=self._delta_delta)
-        elif self._specgram_type == 'fbank':
+        elif self._spectrum_type == 'fbank':
             samples = audio_segment.to('int16')
             return self._compute_fbank(
                 samples,
@@ -164,8 +164,8 @@ class AudioFeaturizer(object):
                 dither=self._dither,
                 delta_delta=self._delta_delta)
         else:
-            raise ValueError("Unknown specgram_type %s. "
-                             "Supported values: linear." % self._specgram_type)
+            raise ValueError("Unknown spectrum_type %s. "
+                             "Supported values: 'linear', 'mfcc', 'fbank'." % self._spectrum_type)
 
     def _compute_linear_specgram(self,
                                  samples,
diff --git a/deepspeech/frontend/featurizer/speech_featurizer.py b/deepspeech/frontend/featurizer/speech_featurizer.py
index e6761cb5..50856e16 100644
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@@ -27,16 +27,16 @@ class SpeechFeaturizer(object):
 
     :param vocab_filepath: Filepath to load vocabulary for token indices
                            conversion.
-    :type specgram_type: str
-    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type specgram_type: str
+    :type vocab_filepath: str
+    :param spectrum_type: Spectrum feature type. Options: 'linear', 'mfcc', 'fbank'.
+    :type spectrum_type: str
     :param stride_ms: Striding size (in milliseconds) for generating frames.
     :type stride_ms: float
     :param window_ms: Window size (in milliseconds) for generating frames.
     :type window_ms: float
-    :param max_freq: When specgram_type is 'linear', only FFT bins
+    :param max_freq: When spectrum_type is 'linear', only FFT bins
                      corresponding to frequencies between [0, max_freq] are
-                     returned; when specgram_type is 'mfcc', max_freq is the
+                     returned; when spectrum_type is 'mfcc', max_freq is the
                      highest band edge of mel filters.
     :types max_freq: None|float
     :param target_sample_rate: Speech are resampled (if upsampling or
@@ -54,7 +54,7 @@ class SpeechFeaturizer(object):
                  unit_type,
                  vocab_filepath,
                  spm_model_prefix=None,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  stride_ms=10.0,
@@ -66,7 +66,7 @@ class SpeechFeaturizer(object):
                  target_dB=-20,
                  dither=1.0):
         self._audio_featurizer = AudioFeaturizer(
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
diff --git a/deepspeech/io/__init__.py b/deepspeech/io/__init__.py
index e180f18e..884e76e5 100644
--- a/deepspeech/io/__init__.py
+++ b/deepspeech/io/__init__.py
@@ -35,7 +35,7 @@ def create_dataloader(manifest_path,
                       stride_ms=10.0,
                       window_ms=20.0,
                       max_freq=None,
-                      specgram_type='linear',
+                      spectrum_type='linear',
                       feat_dim=None,
                       delta_delta=False,
                       use_dB_normalization=True,
@@ -64,7 +64,7 @@ def create_dataloader(manifest_path,
         stride_ms=stride_ms,
         window_ms=window_ms,
         max_freq=max_freq,
-        specgram_type=specgram_type,
+        spectrum_type=spectrum_type,
         feat_dim=feat_dim,
         delta_delta=delta_delta,
         use_dB_normalization=use_dB_normalization,
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index fba5f7c6..fe53d8e3 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -63,7 +63,7 @@ class ManifestDataset(Dataset):
             n_fft=None,  # fft points
             max_freq=None,  # None for samplerate/2
             raw_wav=True,  # use raw_wav or kaldi feature
-            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            spectrum_type='linear',  # 'linear', 'mfcc', 'fbank'
             feat_dim=0,  # 'mfcc', 'fbank'
             delta_delta=False,  # 'mfcc', 'fbank'
             dither=1.0,  # feature dither
@@ -124,7 +124,7 @@ class ManifestDataset(Dataset):
             n_fft=config.data.n_fft,
             max_freq=config.data.max_freq,
             target_sample_rate=config.data.target_sample_rate,
-            specgram_type=config.data.specgram_type,
+            spectrum_type=config.data.spectrum_type,
             feat_dim=config.data.feat_dim,
             delta_delta=config.data.delta_delta,
             dither=config.data.dither,
@@ -152,7 +152,7 @@ class ManifestDataset(Dataset):
                  n_fft=None,
                  max_freq=None,
                  target_sample_rate=16000,
-                 specgram_type='linear',
+                 spectrum_type='linear',
                  feat_dim=None,
                  delta_delta=False,
                  dither=1.0,
@@ -180,7 +180,7 @@ class ManifestDataset(Dataset):
             n_fft (int, optional): fft points for rfft. Defaults to None.
             max_freq (int, optional): max cut freq. Defaults to None.
             target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
-            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
             feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
             delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
             use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
@@ -200,7 +200,7 @@ class ManifestDataset(Dataset):
             unit_type=unit_type,
             vocab_filepath=vocab_filepath,
             spm_model_prefix=spm_model_prefix,
-            specgram_type=specgram_type,
+            spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
             stride_ms=stride_ms,
diff --git a/examples/aishell/s0/conf/deepspeech2.yaml b/examples/aishell/s0/conf/deepspeech2.yaml
index 40f718f8..65531bc9 100644
--- a/examples/aishell/s0/conf/deepspeech2.yaml
+++ b/examples/aishell/s0/conf/deepspeech2.yaml
@@ -13,7 +13,7 @@ data:
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/aishell/s0/local/data.sh b/examples/aishell/s0/local/data.sh
index 2f09b14a..3f0ed0dc 100755
--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --stride_ms=10.0 \
     --window_ms=20.0 \
diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml
index 904624c3..3984a7fe 100644
--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -15,7 +15,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index b880f858..51bd1ad4 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -15,7 +15,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/aishell/s1/local/data.sh b/examples/aishell/s1/local/data.sh
index c6abce3b..ed58bb6f 100755
--- a/examples/aishell/s1/local/data.sh
+++ b/examples/aishell/s1/local/data.sh
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     num_workers=$(nproc)
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --stride_ms=10.0 \
diff --git a/examples/librispeech/s0/conf/deepspeech2.yaml b/examples/librispeech/s0/conf/deepspeech2.yaml
index d1746bff..30178d2f 100644
--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -13,7 +13,7 @@ data:
   max_output_len: .inf
   min_output_input_ratio: 0.00
   max_output_input_ratio: .inf
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/librispeech/s0/local/data.sh b/examples/librispeech/s0/local/data.sh
index 921f1f49..8d09baf6 100755
--- a/examples/librispeech/s0/local/data.sh
+++ b/examples/librispeech/s0/local/data.sh
@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.train.raw" \
     --num_samples=2000 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
diff --git a/examples/librispeech/s1/conf/chunk_confermer.yaml b/examples/librispeech/s1/conf/chunk_confermer.yaml
index ec945a18..db0d937c 100644
--- a/examples/librispeech/s1/conf/chunk_confermer.yaml
+++ b/examples/librispeech/s1/conf/chunk_confermer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml
index 3939ffc6..8441de9c 100644
--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/conformer.yaml b/examples/librispeech/s1/conf/conformer.yaml
index 8f8bf453..3cdde4a4 100644
--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/conf/transformer.yaml b/examples/librispeech/s1/conf/transformer.yaml
index 243b24aa..49baecf9 100644
--- a/examples/librispeech/s1/conf/transformer.yaml
+++ b/examples/librispeech/s1/conf/transformer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/librispeech/s1/local/data.sh b/examples/librispeech/s1/local/data.sh
index fbdd17d5..96924e35 100755
--- a/examples/librispeech/s1/local/data.sh
+++ b/examples/librispeech/s1/local/data.sh
@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
    --manifest_path="data/manifest.train.raw" \
    --num_samples=-1 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
    --feat_dim=80 \
    --delta_delta=false \
    --sample_rate=16000 \
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index dd9ce51f..3f52da7f 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -13,7 +13,7 @@ data:
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  specgram_type: linear
+  spectrum_type: linear
   target_sample_rate: 16000
   max_freq: None
   n_fft: None
diff --git a/examples/tiny/s0/local/data.sh b/examples/tiny/s0/local/data.sh
index 727a3da9..fabf2e40 100755
--- a/examples/tiny/s0/local/data.sh
+++ b/examples/tiny/s0/local/data.sh
@@ -1,4 +1,4 @@
-#! /usr/bin/env bash
+#!/bin/bash
 
 stage=-1
 stop_stage=100
@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="linear" \
+    --spectrum_type="linear" \
     --delta_delta=false \
     --sample_rate=16000 \
     --stride_ms=10.0 \
diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml
index 79006626..cc9a4525 100644
--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml
index aa2b145a..da7341fe 100644
--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml
index 3813daa0..b00da663 100644
--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index 35c11731..39f5e99b 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -16,7 +16,7 @@ data:
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
   raw_wav: True # use raw_wav or kaldi feature
-  specgram_type: fbank #linear, mfcc, fbank
+  spectrum_type: fbank #linear, mfcc, fbank
   feat_dim: 80
   delta_delta: False
   dither: 1.0
diff --git a/examples/tiny/s1/local/data.sh b/examples/tiny/s1/local/data.sh
index deff91e0..5822dc92 100755
--- a/examples/tiny/s1/local/data.sh
+++ b/examples/tiny/s1/local/data.sh
@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
     --manifest_path="data/manifest.tiny.raw" \
     --num_samples=64 \
-    --specgram_type="fbank" \
+    --spectrum_type="fbank" \
     --feat_dim=80 \
     --delta_delta=false \
     --sample_rate=16000 \
--
GitLab
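
Not part of the patch itself: a minimal before/after sketch of the call-site
change downstream code needs once this rename lands. The constructor arguments
are taken from the `AudioFeaturizer.__init__` shown in the diff above; the
import path is assumed from the file paths in this patch.

    # Hypothetical usage sketch, not in this commit.
    from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

    # Before: AudioFeaturizer(specgram_type='fbank', feat_dim=80, ...)
    # After this patch the keyword is `spectrum_type`; the old spelling fails
    # with: TypeError: __init__() got an unexpected keyword argument
    # 'specgram_type'.
    featurizer = AudioFeaturizer(
        spectrum_type='fbank',  # 'linear', 'mfcc' or 'fbank'
        feat_dim=80,            # used by 'mfcc'/'fbank'
        delta_delta=False,
        dither=1.0)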