diff --git a/README.md b/README.md index 3b20bf4944393f9d48177f6fc39aa7f23498cbd7..a92b671cb5df385a0948df9be078808c7391e835 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,11 @@ python datasets/librispeech/librispeech.py --help python compute_mean_std.py ``` -`python compute_mean_std.py` computes mean and stdandard deviation for audio features, and save them to a file with a default name `./mean_std.npz`. This file will be used in both training and inferencing. +`python compute_mean_std.py` computes the mean and standard deviation of the audio features and saves them to a file named `./mean_std.npz` by default. This file is used in both training and inference. The default audio feature is the power spectrum; the mfcc feature is also supported. To train and infer with the mfcc feature, regenerate this file with: + +``` +python compute_mean_std.py --specgram_type mfcc +``` More help for arguments: diff --git a/compute_mean_std.py b/compute_mean_std.py index 9c301c93f6d2ce3ae099caa96830912f76ce6c58..0cc84e73022ecb1333b805457cace39adcc68ce4 100644 --- a/compute_mean_std.py +++ b/compute_mean_std.py @@ -10,6 +10,12 @@ from data_utils.featurizer.audio_featurizer import AudioFeaturizer parser = argparse.ArgumentParser( description='Computing mean and stddev for feature normalizer.') +parser.add_argument( + "--specgram_type", + default='linear', + type=str, + help="Feature type of audio data: 'linear' (power spectrum)" + " or 'mfcc'. 
(default: %(default)s)") parser.add_argument( "--manifest_path", default='datasets/manifest.train', @@ -39,7 +45,7 @@ args = parser.parse_args() def main(): augmentation_pipeline = AugmentationPipeline(args.augmentation_config) - audio_featurizer = AudioFeaturizer() + audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type) def augment_and_featurize(audio_segment): augmentation_pipeline.transform_audio(audio_segment) diff --git a/data_utils/featurizer/audio_featurizer.py b/data_utils/featurizer/audio_featurizer.py index 4b4d02c60f4193d753badae1aaa3b17ab3b7ea43..271e535b6a9f1cded27caf4f63adcc51abf3e835 100644 --- a/data_utils/featurizer/audio_featurizer.py +++ b/data_utils/featurizer/audio_featurizer.py @@ -6,13 +6,15 @@ from __future__ import print_function import numpy as np from data_utils import utils from data_utils.audio import AudioSegment +from python_speech_features import mfcc +from python_speech_features import delta class AudioFeaturizer(object): """Audio featurizer, for extracting features from audio contents of AudioSegment or SpeechSegment. - Currently, it only supports feature type of linear spectrogram. + Currently, it supports feature types of linear spectrogram and mfcc. :param specgram_type: Specgram feature type. Options: 'linear'. :type specgram_type: str @@ -20,9 +22,10 @@ class AudioFeaturizer(object): :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_freq: Used when specgram_type is 'linear', only FFT bins + :param max_freq: When specgram_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are - returned. + returned; when specgram_type is 'mfcc', max_freq is the + highest band edge of mel filters. 
def _compute_mfcc(self,
                  samples,
                  sample_rate,
                  stride_ms=10.0,
                  window_ms=20.0,
                  max_freq=None):
    """Compute mfcc features with their deltas and delta-deltas appended.

    :param samples: Audio samples, passed to the mfcc extractor as the
                    signal.
    :param sample_rate: Sample rate of the audio samples.
    :type sample_rate: int|float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Highest band edge of mel filters. When None, half of
                     the sample rate is used.
    :type max_freq: None|float
    :return: 2-D feature array with one column per frame; every column
             holds the mfcc coefficients followed by their deltas and
             delta-deltas.
    :rtype: ndarray
    :raises ValueError: If max_freq is greater than half of the sample
                        rate, or stride_ms is greater than window_ms.
    """
    if max_freq is None:
        max_freq = sample_rate / 2
    if max_freq > sample_rate / 2:
        # The mel filter bank cannot extend past half of the sample rate.
        raise ValueError("max_freq must not be greater than half of "
                         "sample rate.")
    if stride_ms > window_ms:
        raise ValueError("Stride size must not be greater than "
                         "window size.")
    # compute 13 cepstral coefficients, and the first one is replaced
    # by log(frame energy); window and stride are converted from
    # milliseconds to the seconds expected by the extractor
    mfcc_feat = mfcc(
        signal=samples,
        samplerate=sample_rate,
        winlen=0.001 * window_ms,
        winstep=0.001 * stride_ms,
        highfreq=max_freq)
    # Deltas
    d_mfcc_feat = delta(mfcc_feat, 2)
    # Deltas-Deltas
    dd_mfcc_feat = delta(d_mfcc_feat, 2)
    # Join the three (frames, coefficients) arrays along the feature axis
    # in one call instead of concatenating frame by frame.
    concat_mfcc_feat = np.concatenate(
        (mfcc_feat, d_mfcc_feat, dd_mfcc_feat), axis=1)
    # transpose to be consistent with the linear specgram situation
    return np.transpose(concat_mfcc_feat)
+++ b/data_utils/featurizer/speech_featurizer.py @@ -11,23 +11,24 @@ class SpeechFeaturizer(object): """Speech featurizer, for extracting features from both audio and transcript contents of SpeechSegment. - Currently, for audio parts, it only supports feature type of linear - spectrogram; for transcript parts, it only supports char-level tokenizing - and conversion into a list of token indices. Note that the token indexing - order follows the given vocabulary file. + Currently, for audio parts, it supports feature types of linear + spectrogram and mfcc; for transcript parts, it only supports char-level + tokenizing and conversion into a list of token indices. Note that the + token indexing order follows the given vocabulary file. :param vocab_filepath: Filepath to load vocabulary for token indices conversion. :type specgram_type: basestring - :param specgram_type: Specgram feature type. Options: 'linear'. + :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. :type specgram_type: str :param stride_ms: Striding size (in milliseconds) for generating frames. :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float - :param max_freq: Used when specgram_type is 'linear', only FFT bins + :param max_freq: When specgram_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are - returned. + returned; when specgram_type is 'mfcc', max_freq is the + highest band edge of mel filters. 
:types max_freq: None|float :param target_sample_rate: Speech are resampled (if upsampling or downsampling is allowed) to this before diff --git a/data_utils/normalizer.py b/data_utils/normalizer.py index c123d25d20600140b47da1e93655b15c0053dfea..1f4aae9a0913f323480c46c2d449f9515a65bb7e 100644 --- a/data_utils/normalizer.py +++ b/data_utils/normalizer.py @@ -16,7 +16,7 @@ class FeatureNormalizer(object): if mean_std_filepath is provided (not None), the normalizer will directly initilize from the file. Otherwise, both manifest_path and featurize_func should be given for on-the-fly mean and stddev computing. - + :param mean_std_filepath: File containing the pre-computed mean and stddev. :type mean_std_filepath: None|basestring :param manifest_path: Manifest of instances for computing mean and stddev. diff --git a/requirements.txt b/requirements.txt index 2ae7d0895a3594059e995e20d106f7c30ef92568..721fa2811081e530a9cec3b2e403ad2372b59269 100755 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ wget==3.2 scipy==0.13.1 resampy==0.1.5 https://github.com/kpu/kenlm/archive/master.zip +python_speech_features diff --git a/train.py b/train.py index 3a2d0cad9ec9635c7e44e0149e426842a5e892b6..6481074c6e58f98f57f81c6e42480fa00a261bbe 100644 --- a/train.py +++ b/train.py @@ -53,6 +53,12 @@ parser.add_argument( default=True, type=distutils.util.strtobool, help="Use sortagrad or not. (default: %(default)s)") +parser.add_argument( + "--specgram_type", + default='linear', + type=str, + help="Feature type of audio data: 'linear' (power spectrum)" + " or 'mfcc'. (default: %(default)s)") parser.add_argument( "--max_duration", default=27.0, @@ -130,6 +136,7 @@ def train(): augmentation_config=args.augmentation_config, max_duration=args.max_duration, min_duration=args.min_duration, + specgram_type=args.specgram_type, num_threads=args.num_threads_data) train_generator = data_generator()