提交 6f83be1a 编写于 作者: H Hui Zhang

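Fix spectrum_type typo: rename `specgram_type` to `spectrum_type` across the config defaults, featurizers, dataloader/dataset, YAML configs, and shell scripts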

上级 c8a702e9
...@@ -32,7 +32,7 @@ _C.data = CN( ...@@ -32,7 +32,7 @@ _C.data = CN(
window_ms=20.0, # ms window_ms=20.0, # ms
n_fft=None, # fft points n_fft=None, # fft points
max_freq=None, # None for samplerate/2 max_freq=None, # None for samplerate/2
specgram_type='linear', # 'linear', 'mfcc', 'fbank' spectrum_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank'
delat_delta=False, # 'mfcc', 'fbank' delat_delta=False, # 'mfcc', 'fbank'
target_sample_rate=16000, # target sample rate target_sample_rate=16000, # target sample rate
......
...@@ -24,15 +24,15 @@ class AudioFeaturizer(object): ...@@ -24,15 +24,15 @@ class AudioFeaturizer(object):
Currently, it supports feature types of linear spectrogram and mfcc. Currently, it supports feature types of linear spectrogram and mfcc.
:param specgram_type: Specgram feature type. Options: 'linear'. :param spectrum_type: Specgram feature type. Options: 'linear'.
:type specgram_type: str :type spectrum_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames. :param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float :type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames. :param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float :type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins :param max_freq: When spectrum_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_feq is the returned; when spectrum_type is 'mfcc', max_feq is the
highest band edge of mel filters. highest band edge of mel filters.
:types max_freq: None|float :types max_freq: None|float
:param target_sample_rate: Audio are resampled (if upsampling or :param target_sample_rate: Audio are resampled (if upsampling or
...@@ -47,7 +47,7 @@ class AudioFeaturizer(object): ...@@ -47,7 +47,7 @@ class AudioFeaturizer(object):
""" """
def __init__(self, def __init__(self,
specgram_type: str='linear', spectrum_type: str='linear',
feat_dim: int=None, feat_dim: int=None,
delta_delta: bool=False, delta_delta: bool=False,
stride_ms=10.0, stride_ms=10.0,
...@@ -58,7 +58,7 @@ class AudioFeaturizer(object): ...@@ -58,7 +58,7 @@ class AudioFeaturizer(object):
use_dB_normalization=True, use_dB_normalization=True,
target_dB=-20, target_dB=-20,
dither=1.0): dither=1.0):
self._specgram_type = specgram_type self._spectrum_type = spectrum_type
# mfcc and fbank using `feat_dim` # mfcc and fbank using `feat_dim`
self._feat_dim = feat_dim self._feat_dim = feat_dim
# mfcc and fbank using `delta-delta` # mfcc and fbank using `delta-delta`
...@@ -113,27 +113,27 @@ class AudioFeaturizer(object): ...@@ -113,27 +113,27 @@ class AudioFeaturizer(object):
def feature_size(self): def feature_size(self):
"""audio feature size""" """audio feature size"""
feat_dim = 0 feat_dim = 0
if self._specgram_type == 'linear': if self._spectrum_type == 'linear':
fft_point = self._window_ms if self._fft_point is None else self._fft_point fft_point = self._window_ms if self._fft_point is None else self._fft_point
feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
1) 1)
elif self._specgram_type == 'mfcc': elif self._spectrum_type == 'mfcc':
# mfcc, delta, delta-delta # mfcc, delta, delta-delta
feat_dim = int(self._feat_dim * feat_dim = int(self._feat_dim *
3) if self._delta_delta else int(self._feat_dim) 3) if self._delta_delta else int(self._feat_dim)
elif self._specgram_type == 'fbank': elif self._spectrum_type == 'fbank':
# fbank, delta, delta-delta # fbank, delta, delta-delta
feat_dim = int(self._feat_dim * feat_dim = int(self._feat_dim *
3) if self._delta_delta else int(self._feat_dim) 3) if self._delta_delta else int(self._feat_dim)
else: else:
raise ValueError("Unknown specgram_type %s. " raise ValueError("Unknown spectrum_type %s. "
"Supported values: linear." % self._specgram_type) "Supported values: linear." % self._spectrum_type)
return feat_dim return feat_dim
def _compute_specgram(self, audio_segment): def _compute_specgram(self, audio_segment):
"""Extract various audio features.""" """Extract various audio features."""
sample_rate = audio_segment.sample_rate sample_rate = audio_segment.sample_rate
if self._specgram_type == 'linear': if self._spectrum_type == 'linear':
samples = audio_segment.samples samples = audio_segment.samples
return self._compute_linear_specgram( return self._compute_linear_specgram(
samples, samples,
...@@ -141,7 +141,7 @@ class AudioFeaturizer(object): ...@@ -141,7 +141,7 @@ class AudioFeaturizer(object):
stride_ms=self._stride_ms, stride_ms=self._stride_ms,
window_ms=self._window_ms, window_ms=self._window_ms,
max_freq=self._max_freq) max_freq=self._max_freq)
elif self._specgram_type == 'mfcc': elif self._spectrum_type == 'mfcc':
samples = audio_segment.to('int16') samples = audio_segment.to('int16')
return self._compute_mfcc( return self._compute_mfcc(
samples, samples,
...@@ -152,7 +152,7 @@ class AudioFeaturizer(object): ...@@ -152,7 +152,7 @@ class AudioFeaturizer(object):
max_freq=self._max_freq, max_freq=self._max_freq,
dither=self._dither, dither=self._dither,
delta_delta=self._delta_delta) delta_delta=self._delta_delta)
elif self._specgram_type == 'fbank': elif self._spectrum_type == 'fbank':
samples = audio_segment.to('int16') samples = audio_segment.to('int16')
return self._compute_fbank( return self._compute_fbank(
samples, samples,
...@@ -164,8 +164,8 @@ class AudioFeaturizer(object): ...@@ -164,8 +164,8 @@ class AudioFeaturizer(object):
dither=self._dither, dither=self._dither,
delta_delta=self._delta_delta) delta_delta=self._delta_delta)
else: else:
raise ValueError("Unknown specgram_type %s. " raise ValueError("Unknown spectrum_type %s. "
"Supported values: linear." % self._specgram_type) "Supported values: linear." % self._spectrum_type)
def _compute_linear_specgram(self, def _compute_linear_specgram(self,
samples, samples,
......
...@@ -27,16 +27,16 @@ class SpeechFeaturizer(object): ...@@ -27,16 +27,16 @@ class SpeechFeaturizer(object):
:param vocab_filepath: Filepath to load vocabulary for token indices :param vocab_filepath: Filepath to load vocabulary for token indices
conversion. conversion.
:type specgram_type: str :type spectrum_type: str
:param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. :param spectrum_type: Specgram feature type. Options: 'linear', 'mfcc'.
:type specgram_type: str :type spectrum_type: str
:param stride_ms: Striding size (in milliseconds) for generating frames. :param stride_ms: Striding size (in milliseconds) for generating frames.
:type stride_ms: float :type stride_ms: float
:param window_ms: Window size (in milliseconds) for generating frames. :param window_ms: Window size (in milliseconds) for generating frames.
:type window_ms: float :type window_ms: float
:param max_freq: When specgram_type is 'linear', only FFT bins :param max_freq: When spectrum_type is 'linear', only FFT bins
corresponding to frequencies between [0, max_freq] are corresponding to frequencies between [0, max_freq] are
returned; when specgram_type is 'mfcc', max_freq is the returned; when spectrum_type is 'mfcc', max_freq is the
highest band edge of mel filters. highest band edge of mel filters.
:types max_freq: None|float :types max_freq: None|float
:param target_sample_rate: Speech are resampled (if upsampling or :param target_sample_rate: Speech are resampled (if upsampling or
...@@ -54,7 +54,7 @@ class SpeechFeaturizer(object): ...@@ -54,7 +54,7 @@ class SpeechFeaturizer(object):
unit_type, unit_type,
vocab_filepath, vocab_filepath,
spm_model_prefix=None, spm_model_prefix=None,
specgram_type='linear', spectrum_type='linear',
feat_dim=None, feat_dim=None,
delta_delta=False, delta_delta=False,
stride_ms=10.0, stride_ms=10.0,
...@@ -66,7 +66,7 @@ class SpeechFeaturizer(object): ...@@ -66,7 +66,7 @@ class SpeechFeaturizer(object):
target_dB=-20, target_dB=-20,
dither=1.0): dither=1.0):
self._audio_featurizer = AudioFeaturizer( self._audio_featurizer = AudioFeaturizer(
specgram_type=specgram_type, spectrum_type=spectrum_type,
feat_dim=feat_dim, feat_dim=feat_dim,
delta_delta=delta_delta, delta_delta=delta_delta,
stride_ms=stride_ms, stride_ms=stride_ms,
......
...@@ -35,7 +35,7 @@ def create_dataloader(manifest_path, ...@@ -35,7 +35,7 @@ def create_dataloader(manifest_path,
stride_ms=10.0, stride_ms=10.0,
window_ms=20.0, window_ms=20.0,
max_freq=None, max_freq=None,
specgram_type='linear', spectrum_type='linear',
feat_dim=None, feat_dim=None,
delta_delta=False, delta_delta=False,
use_dB_normalization=True, use_dB_normalization=True,
...@@ -64,7 +64,7 @@ def create_dataloader(manifest_path, ...@@ -64,7 +64,7 @@ def create_dataloader(manifest_path,
stride_ms=stride_ms, stride_ms=stride_ms,
window_ms=window_ms, window_ms=window_ms,
max_freq=max_freq, max_freq=max_freq,
specgram_type=specgram_type, spectrum_type=spectrum_type,
feat_dim=feat_dim, feat_dim=feat_dim,
delta_delta=delta_delta, delta_delta=delta_delta,
use_dB_normalization=use_dB_normalization, use_dB_normalization=use_dB_normalization,
......
...@@ -63,7 +63,7 @@ class ManifestDataset(Dataset): ...@@ -63,7 +63,7 @@ class ManifestDataset(Dataset):
n_fft=None, # fft points n_fft=None, # fft points
max_freq=None, # None for samplerate/2 max_freq=None, # None for samplerate/2
raw_wav=True, # use raw_wav or kaldi feature raw_wav=True, # use raw_wav or kaldi feature
specgram_type='linear', # 'linear', 'mfcc', 'fbank' spectrum_type='linear', # 'linear', 'mfcc', 'fbank'
feat_dim=0, # 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank'
delta_delta=False, # 'mfcc', 'fbank' delta_delta=False, # 'mfcc', 'fbank'
dither=1.0, # feature dither dither=1.0, # feature dither
...@@ -124,7 +124,7 @@ class ManifestDataset(Dataset): ...@@ -124,7 +124,7 @@ class ManifestDataset(Dataset):
n_fft=config.data.n_fft, n_fft=config.data.n_fft,
max_freq=config.data.max_freq, max_freq=config.data.max_freq,
target_sample_rate=config.data.target_sample_rate, target_sample_rate=config.data.target_sample_rate,
specgram_type=config.data.specgram_type, spectrum_type=config.data.spectrum_type,
feat_dim=config.data.feat_dim, feat_dim=config.data.feat_dim,
delta_delta=config.data.delta_delta, delta_delta=config.data.delta_delta,
dither=config.data.dither, dither=config.data.dither,
...@@ -152,7 +152,7 @@ class ManifestDataset(Dataset): ...@@ -152,7 +152,7 @@ class ManifestDataset(Dataset):
n_fft=None, n_fft=None,
max_freq=None, max_freq=None,
target_sample_rate=16000, target_sample_rate=16000,
specgram_type='linear', spectrum_type='linear',
feat_dim=None, feat_dim=None,
delta_delta=False, delta_delta=False,
dither=1.0, dither=1.0,
...@@ -180,7 +180,7 @@ class ManifestDataset(Dataset): ...@@ -180,7 +180,7 @@ class ManifestDataset(Dataset):
n_fft (int, optional): fft points for rfft. Defaults to None. n_fft (int, optional): fft points for rfft. Defaults to None.
max_freq (int, optional): max cut freq. Defaults to None. max_freq (int, optional): max cut freq. Defaults to None.
target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. spectrum_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
use_dB_normalization (bool, optional): do dB normalization. Defaults to True. use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
...@@ -200,7 +200,7 @@ class ManifestDataset(Dataset): ...@@ -200,7 +200,7 @@ class ManifestDataset(Dataset):
unit_type=unit_type, unit_type=unit_type,
vocab_filepath=vocab_filepath, vocab_filepath=vocab_filepath,
spm_model_prefix=spm_model_prefix, spm_model_prefix=spm_model_prefix,
specgram_type=specgram_type, spectrum_type=spectrum_type,
feat_dim=feat_dim, feat_dim=feat_dim,
delta_delta=delta_delta, delta_delta=delta_delta,
stride_ms=stride_ms, stride_ms=stride_ms,
......
...@@ -13,7 +13,7 @@ data: ...@@ -13,7 +13,7 @@ data:
max_output_len: .inf max_output_len: .inf
min_output_input_ratio: 0.00 min_output_input_ratio: 0.00
max_output_input_ratio: .inf max_output_input_ratio: .inf
specgram_type: linear spectrum_type: linear
target_sample_rate: 16000 target_sample_rate: 16000
max_freq: None max_freq: None
n_fft: None n_fft: None
......
...@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc) num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--specgram_type="linear" \ --spectrum_type="linear" \
--delta_delta=false \ --delta_delta=false \
--stride_ms=10.0 \ --stride_ms=10.0 \
--window_ms=20.0 \ --window_ms=20.0 \
......
...@@ -15,7 +15,7 @@ data: ...@@ -15,7 +15,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -15,7 +15,7 @@ data: ...@@ -15,7 +15,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
num_workers=$(nproc) num_workers=$(nproc)
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--specgram_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=80 \
--delta_delta=false \ --delta_delta=false \
--stride_ms=10.0 \ --stride_ms=10.0 \
......
...@@ -13,7 +13,7 @@ data: ...@@ -13,7 +13,7 @@ data:
max_output_len: .inf max_output_len: .inf
min_output_input_ratio: 0.00 min_output_input_ratio: 0.00
max_output_input_ratio: .inf max_output_input_ratio: .inf
specgram_type: linear spectrum_type: linear
target_sample_rate: 16000 target_sample_rate: 16000
max_freq: None max_freq: None
n_fft: None n_fft: None
......
...@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -62,7 +62,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--num_samples=2000 \ --num_samples=2000 \
--specgram_type="linear" \ --spectrum_type="linear" \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=16000 \
--stride_ms=10.0 \ --stride_ms=10.0 \
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -68,7 +68,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \ --manifest_path="data/manifest.train.raw" \
--num_samples=-1 \ --num_samples=-1 \
--specgram_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=80 \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=16000 \
......
...@@ -13,7 +13,7 @@ data: ...@@ -13,7 +13,7 @@ data:
max_output_len: 400.0 max_output_len: 400.0
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
specgram_type: linear spectrum_type: linear
target_sample_rate: 16000 target_sample_rate: 16000
max_freq: None max_freq: None
n_fft: None n_fft: None
......
#! /usr/bin/env bash #!/bin/bash
stage=-1 stage=-1
stop_stage=100 stop_stage=100
...@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -46,7 +46,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \ --manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \ --num_samples=64 \
--specgram_type="linear" \ --spectrum_type="linear" \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=16000 \
--stride_ms=10.0 \ --stride_ms=10.0 \
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -16,7 +16,7 @@ data: ...@@ -16,7 +16,7 @@ data:
min_output_input_ratio: 0.05 min_output_input_ratio: 0.05
max_output_input_ratio: 10.0 max_output_input_ratio: 10.0
raw_wav: True # use raw_wav or kaldi feature raw_wav: True # use raw_wav or kaldi feature
specgram_type: fbank #linear, mfcc, fbank spectrum_type: fbank #linear, mfcc, fbank
feat_dim: 80 feat_dim: 80
delta_delta: False delta_delta: False
dither: 1.0 dither: 1.0
......
...@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -51,7 +51,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \ python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.tiny.raw" \ --manifest_path="data/manifest.tiny.raw" \
--num_samples=64 \ --num_samples=64 \
--specgram_type="fbank" \ --spectrum_type="fbank" \
--feat_dim=80 \ --feat_dim=80 \
--delta_delta=false \ --delta_delta=false \
--sample_rate=16000 \ --sample_rate=16000 \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册