"""Contains the speech featurizer class.""" from __future__ import absolute_import from __future__ import division from __future__ import print_function from data_utils.featurizer.audio_featurizer import AudioFeaturizer from data_utils.featurizer.text_featurizer import TextFeaturizer class SpeechFeaturizer(object): """Speech featurizer, for extracting features from both audio and transcript contents of SpeechSegment. Currently, for audio parts, it only supports feature type of linear spectrogram; for transcript parts, it only supports char-level tokenizing and conversion into a list of token indices. Note that the token indexing order follows the given vocabulary file. :param vocab_filepath: Filepath to load vocabulary for token indices conversion. :type specgram_type: basestring :param specgram_type: Specgram feature type. Options: 'linear'. :type specgram_type: str :param stride_ms: Striding size (in milliseconds) for generating frames. :type stride_ms: float :param window_ms: Window size (in milliseconds) for generating frames. :type window_ms: float :param max_freq: Used when specgram_type is 'linear', only FFT bins corresponding to frequencies between [0, max_freq] are returned. :types max_freq: None|float """ def __init__(self, vocab_filepath, specgram_type='linear', stride_ms=10.0, window_ms=20.0, max_freq=None): self._audio_featurizer = AudioFeaturizer(specgram_type, stride_ms, window_ms, max_freq) self._text_featurizer = TextFeaturizer(vocab_filepath) def featurize(self, speech_segment): """Extract features for speech segment. 1. For audio parts, extract the audio features. 2. For transcript parts, convert text string to a list of token indices in char-level. :param audio_segment: Speech segment to extract features from. :type audio_segment: SpeechSegment :return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of char-level token indices. :rtype: tuple """ audio_feature = self._audio_featurizer.featurize(speech_segment) text_ids = self._text_featurizer.featurize(speech_segment.transcript) return audio_feature, text_ids @property def vocab_size(self): """Return the vocabulary size. :return: Vocabulary size. :rtype: int """ return self._text_featurizer.vocab_size @property def vocab_list(self): """Return the vocabulary in list. :return: Vocabulary in list. :rtype: list """ return self._text_featurizer.vocab_list