# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains the audio featurizer class."""

import numpy as np
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.audio import AudioSegment
from python_speech_features import mfcc
from python_speech_features import logfbank
from python_speech_features import delta


class AudioFeaturizer(object):
    """Audio featurizer, for extracting features from audio contents of
    AudioSegment or SpeechSegment.

Y
Yibing Liu 已提交
28
    Currently, it supports feature types of linear spectrogram and mfcc.
29 30 31 32 33 34 35

    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
Y
Yibing Liu 已提交
36
    :param max_freq: When specgram_type is 'linear', only FFT bins
37
                     corresponding to frequencies between [0, max_freq] are
Y
Yibing Liu 已提交
38 39
                     returned; when specgram_type is 'mfcc', max_feq is the
                     highest band edge of mel filters.
40
    :types max_freq: None|float
41 42 43 44 45 46 47 48 49
    :param target_sample_rate: Audio are resampled (if upsampling or
                               downsampling is allowed) to this before
                               extracting spectrogram features.
    :type target_sample_rate: float
    :param use_dB_normalization: Whether to normalize the audio to a certain
                                 decibels before extracting the features.
    :type use_dB_normalization: bool
    :param target_dB: Target audio decibels for normalization.
    :type target_dB: float
50 51
    """

    def __init__(self,
                 specgram_type: str='linear',
                 feat_dim: int=None,
                 delta_delta: bool=False,
                 stride_ms=10.0,
                 window_ms=20.0,
                 n_fft=None,
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
                 target_dB=-20):
        self._specgram_type = specgram_type
        # mfcc and fbank use `feat_dim`
        self._feat_dim = feat_dim
        # mfcc and fbank use `delta_delta`
        self._delta_delta = delta_delta
        self._stride_ms = stride_ms
        self._window_ms = window_ms
        self._max_freq = max_freq
        self._target_sample_rate = target_sample_rate
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB
        self._fft_point = n_fft

    def featurize(self,
                  audio_segment,
                  allow_downsampling=True,
                  allow_upsampling=True):
        """Extract audio features from AudioSegment or SpeechSegment.

        :param audio_segment: Audio/speech segment to extract features from.
        :type audio_segment: AudioSegment|SpeechSegment
        :param allow_downsampling: Whether to allow audio downsampling before
                                   featurizing.
        :type allow_downsampling: bool
        :param allow_upsampling: Whether to allow audio upsampling before
                                 featurizing.
        :type allow_upsampling: bool
        :return: Spectrogram audio feature in 2darray.
        :rtype: ndarray
        :raises ValueError: If audio sample rate is not supported.
        """
        # upsampling or downsampling
        if ((audio_segment.sample_rate > self._target_sample_rate and
             allow_downsampling) or
            (audio_segment.sample_rate < self._target_sample_rate and
             allow_upsampling)):
            audio_segment.resample(self._target_sample_rate)
        if audio_segment.sample_rate != self._target_sample_rate:
            raise ValueError("Audio sample rate is not supported. "
                             "Turn allow_downsampling or allow up_sampling on.")
        # decibel normalization
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
        return self._compute_specgram(audio_segment.samples,
                                      audio_segment.sample_rate)

    @property
    def feature_size(self):
        """audio feature size"""
        feat_dim = 0
        if self._specgram_type == 'linear':
            fft_point = self._window_ms if self._fft_point is None else self._fft_point
            feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 +
                           1)
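            # e.g. with the defaults (window_ms=20, target_sample_rate=16000,
            # n_fft=None) the line above gives int(20 * 16 / 2 + 1) = 161 bins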
        elif self._specgram_type == 'mfcc':
            # mfcc, delta, delta-delta
            feat_dim = int(self._feat_dim *
                           3) if self._delta_delta else int(self._feat_dim)
        elif self._specgram_type == 'fbank':
            # fbank, delta, delta-delta
            feat_dim = int(self._feat_dim *
                           3) if self._delta_delta else int(self._feat_dim)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)
        return feat_dim

    def _compute_specgram(self, samples, sample_rate):
        """Extract various audio features."""
        if self._specgram_type == 'linear':
            return self._compute_linear_specgram(
                samples,
                sample_rate,
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq)
        elif self._specgram_type == 'mfcc':
            return self._compute_mfcc(
                samples,
                sample_rate,
                feat_dim=self._feat_dim,
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
                delta_delta=self._delta_delta)
        elif self._specgram_type == 'fbank':
            return self._compute_fbank(
                samples,
                sample_rate,
                feat_dim=self._feat_dim,
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
                delta_delta=self._delta_delta)
        else:
            raise ValueError("Unknown specgram_type %s. "
                             "Supported values: linear." % self._specgram_type)

    def _compute_linear_specgram(self,
                                 samples,
                                 sample_rate,
                                 stride_ms=10.0,
                                 window_ms=20.0,
                                 max_freq=None,
                                 eps=1e-14):
        """Compute the linear spectrogram from FFT energy."""
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        specgram, freqs = self._specgram_real(
            samples,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(specgram[:ind, :] + eps)

    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
        """Compute the spectrogram for samples from a real signal."""
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
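        # each column of `windows` is one frame; the assert below spot-checks
        # the stride trick against a directly-sliced second frame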
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        # https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html
        fft = np.fft.rfft(windows * weighting, n=None, axis=0)
        fft = np.absolute(fft)
        fft = fft**2
        scale = np.sum(weighting**2) * sample_rate
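        # one-sided spectrum: interior bins are doubled to account for the
        # folded negative frequencies; the first (DC) and last bins are not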
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs

    def _concat_delta_delta(self, feat):
        """append delat, delta-delta feature.

        Args:
            feat (np.ndarray): (D, T)

        Returns:
            np.ndarray: feat with delta-delta, (3*D, T)
        """
        feat = np.transpose(feat)
        # Deltas
        d_feat = delta(feat, 2)
        # Deltas-Deltas
        dd_feat = delta(d_feat, 2)
        # transpose
        feat = np.transpose(feat)
        d_feat = np.transpose(d_feat)
        dd_feat = np.transpose(dd_feat)
        # concat above three features
        concat_feat = np.concatenate((feat, d_feat, dd_feat))
        return concat_feat

    def _compute_mfcc(self,
                      samples,
                      sample_rate,
                      feat_dim=13,
                      stride_ms=10.0,
                      window_ms=20.0,
                      max_freq=None,
                      delta_delta=True):
        """Compute mfcc from samples.

        Args:
            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array.
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int, optional): the number of cepstral coefficients to return. Defaults to 13.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
            window_ms (float, optional): window length in ms. Defaults to 20.0.
            max_freq (float, optional): highest band edge of mel filters, in Hz. Defaults to None, which means samplerate/2.
            delta_delta (bool, optional): whether to append delta and delta-delta features. Defaults to True.

        Raises:
            ValueError: max_freq > samplerate/2
            ValueError: stride_ms > window_ms

        Returns:
            np.ndarray: mfcc feature, (D, T).
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # compute `feat_dim` cepstral coefficients; the first one is replaced
        # by log(frame energy), (T, D)
        mfcc_feat = mfcc(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            numcep=feat_dim,
            nfilt=2 * feat_dim,
            nfft=None,
            lowfreq=0,
            highfreq=max_freq,
            preemph=0.97,
            ceplifter=22,
            appendEnergy=True,
            winfunc=lambda x: np.ones((x, )))
        mfcc_feat = np.transpose(mfcc_feat)
        if delta_delta:
            mfcc_feat = self._concat_delta_delta(mfcc_feat)
        return mfcc_feat

    def _compute_fbank(self,
                       samples,
                       sample_rate,
                       feat_dim=26,
                       stride_ms=10.0,
                       window_ms=20.0,
                       max_freq=None,
                       delta_delta=False):
        """Compute logfbank from samples.
        
        Args:
            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array.
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int, optional): the number of mel filterbank channels to return. Defaults to 26.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
            window_ms (float, optional): window length in ms. Defaults to 20.0.
            max_freq (float, optional): highest band edge of mel filters, in Hz. Defaults to None, which means samplerate/2.
            delta_delta (bool, optional): whether to append delta and delta-delta features. Defaults to False.

        Raises:
            ValueError: max_freq > samplerate/2
            ValueError: stride_ms > window_ms

        Returns:
            np.ndarray: fbank feature, (D, T).
        """
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        # (T, D)
        fbank_feat = logfbank(
            signal=samples,
            samplerate=sample_rate,
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            nfilt=feat_dim,
            nfft=512,
            lowfreq=0,
            highfreq=max_freq,
            preemph=0.97, )
        fbank_feat = np.transpose(fbank_feat)
        if delta_delta:
            fbank_feat = self._concat_delta_delta(fbank_feat)
        return fbank_feat
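

# A minimal usage sketch (an illustration, not part of the original module).
# It assumes `AudioSegment.from_file` is available in deepspeech.frontend.audio
# and that "demo.wav" is a placeholder path to any mono wav file on disk.
if __name__ == '__main__':
    featurizer = AudioFeaturizer(
        specgram_type='fbank', feat_dim=26, delta_delta=True)
    audio = AudioSegment.from_file("demo.wav")
    # feature_size is 26 * 3 = 78 here; the returned feature is (78, T)
    feat = featurizer.featurize(audio)
    print(featurizer.feature_size, feat.shape)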