add fbank, refactor feature of speech

177f463d · Hui Zhang · ed793b30 · 177f463d · 177f463d · 177f463d
5 changed file
--- a/deepspeech/frontend/augmentor/base.py
+++ b/deepspeech/frontend/augmentor/base.py
@@ -41,3 +41,17 @@ class AugmentorBase():
        :type audio_segment: AudioSegmenet|SpeechSegment
        """
        pass
+    @abstractmethod
+    def transform_spectrogram(self, spec_segment):
+        """Adds various effects to the input spectrogram segment. Such effects
+        will augment the training data to make the model invariant to certain
+        types of time_mask or freq_mask in the real world, improving model's
+        generalization ability.
+        Note that this is an in-place transformation.
+        :param spec_segment: Spectrogram segment to add effects to.
+        :type spec_segment: Spectrogram
+        """
+        pass
\ No newline at end of file
--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -17,6 +17,7 @@ import numpy as np
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.frontend.audio import AudioSegment
 from python_speech_features import mfcc
+from python_speech_features import logfbank
 from python_speech_features import delta
@@ -49,7 +50,9 @@ class AudioFeaturizer(object):
    """
    def __init__(self,
-                 specgram_type='linear',
+                 specgram_type: str='linear',
+                 feat_dim: int=None,
+                 delta_delta: bool=False,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
@@ -58,6 +61,8 @@ class AudioFeaturizer(object):
                 use_dB_normalization=True,
                 target_dB=-20):
        self._specgram_type = specgram_type
+        self._feat_dim = feat_dim
+        self._delta_delta = delta_delta
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
@@ -110,7 +115,12 @@ class AudioFeaturizer(object):
                           1)
        elif self._specgram_type == 'mfcc':
            # mfcc, delta, delta-delta
-            feat_dim = int(13 * 3)
+            feat_dim = int(self._feat_dim *
+                           3) if self._delta_delta else int(self._feat_dim)
+        elif self._specgram_type == 'fbank':
+            # fbank, delta, delta-delta
+            feat_dim = int(self._feat_dim *
+                           3) if self._delta_delta else int(self._feat_dim)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)
@@ -123,8 +133,23 @@ class AudioFeaturizer(object):
                samples, sample_rate, self._stride_ms, self._window_ms,
                self._max_freq)
        elif self._specgram_type == 'mfcc':
-            return self._compute_mfcc(samples, sample_rate, self._stride_ms,
+            return self._compute_mfcc(
-                                      self._window_ms, self._max_freq)
+                samples,
+                sample_rate,
+                self._stride_ms,
+                self._feat_dim,
+                self._window_ms,
+                self._max_freq,
+                delta_delta=self._delta_delta)
+        elif self._specgram_type == 'fbank':
+            return self._compute_fbank(
+                samples,
+                sample_rate,
+                self._stride_ms,
+                self._feat_dim,
+                self._window_ms,
+                self._max_freq,
+                delta_delta=self._delta_delta)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)
@@ -179,13 +204,54 @@ class AudioFeaturizer(object):
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs
+    def _concat_delta_delta(self, feat):
+        """append delat, delta-delta feature.
+        Args:
+            feat (np.ndarray): (D, T)
+        Returns:
+            np.ndarray: feat with delta-delta, (3*D, T)
+        """
+        feat = np.transpose(feat)
+        # Deltas
+        d_feat = delta(feat, 2)
+        # Deltas-Deltas
+        dd_feat = delta(feat, 2)
+        # transpose
+        feat = np.transpose(feat)
+        d_feat = np.transpose(d_feat)
+        dd_feat = np.transpose(dd_feat)
+        # concat above three features
+        concat_feat = np.concatenate((feat, d_feat, dd_feat))
+        return concat_feat
    def _compute_mfcc(self,
                      samples,
                      sample_rate,
+                      feat_dim=13,
                      stride_ms=10.0,
                      window_ms=20.0,
-                      max_freq=None):
+                      max_freq=None,
-        """Compute mfcc from samples."""
+                      delta_delta=True):
+        """Compute mfcc from samples.
+        Args:
+            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
+            sample_rate (float): the sample rate of the signal we are working with, in Hz.
+            feat_dim (int): the number of cepstrum to return, default 13.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms. Defaults to 20.0.
+            max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
+            delta_delta (bool, optional): Whether with delta delta. Defaults to False.
+        Raises:
+            ValueError: max_freq > samplerate/2
+            ValueError: stride_ms > window_ms
+        Returns:
+            np.ndarray: mfcc feature, (D, T).
+        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
@@ -195,22 +261,73 @@ class AudioFeaturizer(object):
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # compute the 13 cepstral coefficients, and the first one is replaced
-        # by log(frame energy)
+        # by log(frame energy), (T, D)
        mfcc_feat = mfcc(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
-            highfreq=max_freq)
+            numcep=feat_dim,
-        # Deltas
+            nfilt=2 * feat_dim,
-        d_mfcc_feat = delta(mfcc_feat, 2)
+            nfft=None,
-        # Deltas-Deltas
+            lowfreq=0,
-        dd_mfcc_feat = delta(d_mfcc_feat, 2)
+            highfreq=max_freq,
-        # transpose
+            preemph=0.97,
+            ceplifter=22,
+            appendEnergy=True,
+            winfunc=lambda x: np.ones((x, )))
        mfcc_feat = np.transpose(mfcc_feat)
-        d_mfcc_feat = np.transpose(d_mfcc_feat)
+        if delta_delta:
-        dd_mfcc_feat = np.transpose(dd_mfcc_feat)
+            mfcc_feat = self._concat_delta_delta(mfcc_feat)
-        # concat above three features
+        return mfcc_feat
-        concat_mfcc_feat = np.concatenate(
-            (mfcc_feat, d_mfcc_feat, dd_mfcc_feat))
+    def _compute_fbank(self,
-        return concat_mfcc_feat
+                       samples,
+                       sample_rate,
+                       feat_dim=26,
+                       stride_ms=10.0,
+                       window_ms=20.0,
+                       max_freq=None,
+                       delta_delta=False):
+        """Compute logfbank from samples.
+        Args:
+            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
+            sample_rate (float): the sample rate of the signal we are working with, in Hz.
+            feat_dim (int): the number of cepstrum to return, default 13.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms. Defaults to 20.0.
+            max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
+            delta_delta (bool, optional): Whether with delta delta. Defaults to False.
+        Raises:
+            ValueError: max_freq > samplerate/2
+            ValueError: stride_ms > window_ms
+        Returns:
+            np.ndarray: mfcc feature, (D, T).
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        #(T, D)
+        fbank_feat = logfbank(
+            signal=samples,
+            samplerate=sample_rate,
+            winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
+            nfilt=feat_dim,
+            nfft=512,
+            lowfreq=max_freq,
+            highfreq=None,
+            preemph=0.97,
+            winfunc=lambda x: np.ones((x, )))
+        fbank_feat = np.transpose(fbank_feat)
+        if delta_delta:
+            fbank_feat = self._concat_delta_delta(fbank_feat)
+        return fbank_feat
--- a/deepspeech/frontend/featurizer/speech_featurizer.py
+++ b/deepspeech/frontend/featurizer/speech_featurizer.py
@@ -56,6 +56,8 @@ class SpeechFeaturizer(object):
                 vocab_filepath,
                 spm_model_prefix=None,
                 specgram_type='linear',
+                 feat_dim=13,
+                 delta_delta=True,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
@@ -65,6 +67,8 @@ class SpeechFeaturizer(object):
                 target_dB=-20):
        self._audio_featurizer = AudioFeaturizer(
            specgram_type=specgram_type,
+            feat_dim=feat_dim,
+            delta_delta=delta_delta,
            stride_ms=stride_ms,
            window_ms=window_ms,
            n_fft=n_fft,
@@ -82,17 +86,22 @@ class SpeechFeaturizer(object):
        2. For transcript parts, keep the original text or convert text string
           to a list of token indices in char-level.
-        :param audio_segment: Speech segment to extract features from.
+        Args:
-        :type audio_segment: SpeechSegment
+            speech_segment (SpeechSegment): Speech segment to extract features from.
-        :return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of
+            keep_transcription_text (bool): True, keep transcript text, False, token ids
-                 char-level token indices.
-        :rtype: tuple
+        Returns:
+            tuple: 1) spectrogram audio feature in 2darray, 2) list oftoken indices.
        """
-        audio_feature = self._audio_featurizer.featurize(speech_segment)
+        spec_feature = self._audio_featurizer.featurize(speech_segment)
        if keep_transcription_text:
-            return audio_feature, speech_segment.transcript
+            return spec_feature, speech_segment.transcript
-        text_ids = self._text_featurizer.featurize(speech_segment.transcript)
+        if speech_segment.has_token:
-        return audio_feature, text_ids
+            text_ids = speech_segment.token_ids
+        else:
+            text_ids = self._text_featurizer.featurize(
+                speech_segment.transcript)
+        return spec_feature, text_ids
    @property
    def vocab_size(self):

--- a/deepspeech/frontend/speech.py
+++ b/deepspeech/frontend/speech.py
@@ -24,7 +24,12 @@ class SpeechSegment(AudioSegment):
        AudioSegment (AudioSegment): Audio Segment
    """
-    def __init__(self, samples, sample_rate, transcript):
+    def __init__(self,
+                 samples,
+                 sample_rate,
+                 transcript,
+                 tokens=None,
+                 token_ids=None):
        """Speech segment abstraction, a subclass of AudioSegment,
            with an additional transcript.
@@ -32,9 +37,13 @@ class SpeechSegment(AudioSegment):
            samples (ndarray.float32): Audio samples [num_samples x num_channels].
            sample_rate (int): Audio sample rate.
            transcript (str): Transcript text for the speech.
+            tokens (List[str], optinal): Transcript tokens for the speech.
+            token_ids (List[int], optional): Transcript token ids for the speech.
        """
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript
+        self._tokens = tokens
+        self._token_ids = token_ids
    def __eq__(self, other):
        """Return whether two objects are equal.
@@ -46,6 +55,11 @@ class SpeechSegment(AudioSegment):
            return False
        if self._transcript != other._transcript:
            return False
+        if self.has_token and other.has_token:
+            if self._tokens != other._tokens:
+                return False
+            if self._token_ids != other._token_ids:
+                return False
        return True
    def __ne__(self, other):
@@ -53,33 +67,39 @@ class SpeechSegment(AudioSegment):
        return not self.__eq__(other)
    @classmethod
-    def from_file(cls, filepath, transcript):
+    def from_file(cls, filepath, transcript, tokens=None, token_ids=None):
        """Create speech segment from audio file and corresponding transcript.
-        :param filepath: Filepath or file object to audio file.
+        Args:
-        :type filepath: str|file
+            filepath (str|file): Filepath or file object to audio file.
-        :param transcript: Transcript text for the speech.
+            transcript (str): Transcript text for the speech.
-        :type transript: str
+            tokens (List[str], optional): text tokens. Defaults to None.
-        :return: Speech segment instance.
+            token_ids (List[int], optional): text token ids. Defaults to None.
-        :rtype: SpeechSegment
+        Returns:
+            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_file(filepath)
-        return cls(audio.samples, audio.sample_rate, transcript)
+        return cls(audio.samples, audio.sample_rate, transcript, tokens,
+                   token_ids)
    @classmethod
-    def from_bytes(cls, bytes, transcript):
+    def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None):
        """Create speech segment from a byte string and corresponding
-        transcript.
+        Args:
-        :param bytes: Byte string containing audio samples.
+            filepath (str|file): Filepath or file object to audio file.
-        :type bytes: str
+            transcript (str): Transcript text for the speech.
-        :param transcript: Transcript text for the speech.
+            tokens (List[str], optional): text tokens. Defaults to None.
-        :type transript: str
+            token_ids (List[int], optional): text token ids. Defaults to None.
-        :return: Speech segment instance.
-        :rtype: Speech Segment
+        Returns:
+            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_bytes(bytes)
-        return cls(audio.samples, audio.sample_rate, transcript)
+        return cls(audio.samples, audio.sample_rate, transcript, tokens,
+                   token_ids)
    @classmethod
    def concatenate(cls, *segments):
@@ -98,6 +118,8 @@ class SpeechSegment(AudioSegment):
            raise ValueError("No speech segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        transcripts = ""
+        tokens = []
+        token_ids = []
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
@@ -106,11 +128,20 @@ class SpeechSegment(AudioSegment):
                raise TypeError("Only speech segments of the same type "
                                "instance can be concatenated.")
            transcripts += seg._transcript
+            if self.has_token:
+                tokens += seg._tokens
+                token_ids += seg._token_ids
        samples = np.concatenate([seg.samples for seg in segments])
-        return cls(samples, sample_rate, transcripts)
+        return cls(samples, sample_rate, transcripts, tokens, token_ids)
    @classmethod
-    def slice_from_file(cls, filepath, transcript, start=None, end=None):
+    def slice_from_file(cls,
+                        filepath,
+                        transcript,
+                        tokens=None,
+                        token_ids=None,
+                        start=None,
+                        end=None):
        """Loads a small section of an speech without having to load
        the entire file into the memory which can be incredibly wasteful.
@@ -132,28 +163,54 @@ class SpeechSegment(AudioSegment):
        :rtype: SpeechSegment
        """
        audio = AudioSegment.slice_from_file(filepath, start, end)
-        return cls(audio.samples, audio.sample_rate, transcript)
+        return cls(audio.samples, audio.sample_rate, transcript, tokens,
+                   token_ids)
    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Creates a silent speech segment of the given duration and
        sample rate, transcript will be an empty string.
-        :param duration: Length of silence in seconds.
+        Args:
-        :type duration: float
+            duration (float): Length of silence in seconds.
-        :param sample_rate: Sample rate.
+            sample_rate (float): Sample rate.
-        :type sample_rate: float
-        :return: Silence of the given duration.
+        Returns:
-        :rtype: SpeechSegment
+            SpeechSegment: Silence of the given duration.
        """
        audio = AudioSegment.make_silence(duration, sample_rate)
        return cls(audio.samples, audio.sample_rate, "")
+    @property
+    def has_token(self):
+        if self._tokens or self._token_ids:
+            return True
+        return False
    @property
    def transcript(self):
        """Return the transcript text.
-        :return: Transcript text for the speech.
+        Returns:
-        :rtype: str
+            str: Transcript text for the speech.
        """
        return self._transcript
+    @property
+    def tokens(self):
+        """Return the transcript text tokens.
+        Returns:
+            List[str]: text tokens.
+        """
+        return self._tokens
+    @property
+    def token_ids(self):
+        """Return the transcript text token ids.
+        Returns:
+            List[int]: text token ids.
+        """
+        return self._token_ids
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -26,8 +26,14 @@ add_arg = functools.partial(add_arguments, argparser=parser)
 add_arg('num_samples',      int,    2000,    "# of samples to for statistics.")
 add_arg('specgram_type',    str,
        'linear',
-        "Audio feature type. Options: linear, mfcc.",
+        "Audio feature type. Options: linear, mfcc, fbank.",
-        choices=['linear', 'mfcc'])
+        choices=['linear', 'mfcc', 'fbank'])
+add_arg('feat_dim',    int,
+        13,
+        "Audio feature dim.")
+add_arg('delta_delta',    bool,
+        False,
+        "Audio feature with delta delta.")
 add_arg('manifest_path',    str,
        'data/librispeech/manifest.train',
        "Filepath of manifest to compute normalizer's mean and stddev.")
@@ -42,7 +48,10 @@ def main():
    print_arguments(args)
    augmentation_pipeline = AugmentationPipeline('{}')
-    audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)
+    audio_featurizer = AudioFeaturizer(
+        specgram_type=args.specgram_type,
+        feat_dim=args.feat_dim,
+        delta_delta=args.delta_delta)
    def augment_and_featurize(audio_segment):
        augmentation_pipeline.transform_audio(audio_segment)