"""
    Providing a basic audio data preprocessing pipeline, and offering
    both instance-level and batch-level data reader interfaces.
"""
import paddle.v2 as paddle
import logging
import json
import random
import soundfile
import numpy as np
import os

RANDOM_SEED = 0
logger = logging.getLogger(__name__)


class DataGenerator(object):
    """
    DataGenerator provides a basic audio data preprocessing pipeline, and
    offers both instance-level and batch-level data reader interfaces.
    Normalized FFT features are used as the audio features here.

    :param vocab_filepath: Vocabulary file path for indexing tokenized
                           transcriptions.
    :type vocab_filepath: basestring
    :param normalizer_manifest_path: Manifest filepath for collecting feature
                                     normalization statistics, e.g. mean, std.
    :type normalizer_manifest_path: basestring
    :param normalizer_num_samples: Number of instances sampled for collecting
                                   feature normalization statistics.
                                   Default is 100.
    :type normalizer_num_samples: int
    :param max_duration: Audio clips with duration (in seconds) greater than
                         this will be discarded. Default is 20.0.
    :type max_duration: float
    :param min_duration: Audio clips with duration (in seconds) smaller than
                         this will be discarded. Default is 0.0.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
                      Default is 10.0. 
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for frames. Default is 20.0.
    :type window_ms: float
    :param max_frequency: Maximum frequency for FFT features. FFT features of
                          frequency larger than this will be discarded.
                          If set to None, all features will be kept.
                          Default is None.
    :type max_frequency: float
    """

    def __init__(self,
                 vocab_filepath,
                 normalizer_manifest_path,
                 normalizer_num_samples=100,
                 max_duration=20.0,
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_frequency=None):
        self.__max_duration__ = max_duration
        self.__min_duration__ = min_duration
        self.__stride_ms__ = stride_ms
        self.__window_ms__ = window_ms
        self.__max_frequency__ = max_frequency
        self.__random__ = random.Random(RANDOM_SEED)
        # load vocabulary (dictionary)
        self.__vocab_dict__, self.__vocab_list__ = \
            self.__load_vocabulary_from_file__(vocab_filepath)
        # collect normalizer statistics
        self.__mean__, self.__std__ = self.__collect_normalizer_statistics__(
            manifest_path=normalizer_manifest_path,
            num_samples=normalizer_num_samples)

    def __audio_featurize__(self, audio_filename):
        """
        Preprocess audio data, including feature extraction, normalization,
        etc.
        """
        features = self.__audio_basic_featurize__(audio_filename)
        return self.__normalize__(features)

    def __text_featurize__(self, text):
        """
        Preprocess text data, including tokenizing and token indexing, etc.
        """
        return self.__convert_text_to_char_index__(
            text=text, vocabulary=self.__vocab_dict__)

    def __audio_basic_featurize__(self, audio_filename):
        """
        Compute basic features (without normalization, etc.) for audio data.
        """
        return self.__spectrogram_from_file__(
            filename=audio_filename,
            stride_ms=self.__stride_ms__,
            window_ms=self.__window_ms__,
            max_freq=self.__max_frequency__)

    def __collect_normalizer_statistics__(self, manifest_path, num_samples=100):
        """
        Compute feature normalization statistics, i.e. mean and stddev.
        """
        # read manifest
        manifest = self.__read_manifest__(
            manifest_path=manifest_path,
            max_duration=self.__max_duration__,
            min_duration=self.__min_duration__)
        # sample for statistics
        sampled_manifest = self.__random__.sample(manifest, num_samples)
        # extract spectrogram feature
        features = []
        for instance in sampled_manifest:
            spectrogram = self.__audio_basic_featurize__(
                instance["audio_filepath"])
            features.append(spectrogram)
        features = np.hstack(features)
        mean = np.mean(features, axis=1).reshape([-1, 1])
        std = np.std(features, axis=1).reshape([-1, 1])
        return mean, std
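
    # Note on shapes (inferred from the code above, not stated in the
    # original docs): `mean` and `std` are (num_freq_bins, 1) column
    # vectors, so they broadcast over the time axis when normalizing a
    # (num_freq_bins, num_frames) spectrogram in __normalize__, e.g.:
    #
    #     features = np.hstack([spec_a, spec_b])  # (bins, frames_a + frames_b)
    #     mean = np.mean(features, axis=1).reshape([-1, 1])  # (bins, 1)
    #     normalized = (spec_a - mean) / (std + eps)  # still (bins, frames_a)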

    def __normalize__(self, features, eps=1e-14):
        """
        Normalize features to be of zero mean and unit stddev.
        """
        return (features - self.__mean__) / (self.__std__ + eps)

    def __spectrogram_from_file__(self,
                                  filename,
                                  stride_ms=10.0,
                                  window_ms=20.0,
                                  max_freq=None,
                                  eps=1e-14):
        """
        Load audio data and compute the log spectrogram via FFT.
        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
        """
        audio, sample_rate = soundfile.read(filename)
        if audio.ndim >= 2:
            audio = np.mean(audio, 1)
        if max_freq is None:
            max_freq = sample_rate / 2
        if max_freq > sample_rate / 2:
            raise ValueError("max_freq must not be greater than half of "
                             "sample rate.")
        if stride_ms > window_ms:
            raise ValueError("Stride size must not be greater than "
                             "window size.")
        stride_size = int(0.001 * sample_rate * stride_ms)
        window_size = int(0.001 * sample_rate * window_ms)
        spectrogram, freqs = self.__extract_spectrogram__(
            audio,
            window_size=window_size,
            stride_size=stride_size,
            sample_rate=sample_rate)
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        return np.log(spectrogram[:ind, :] + eps)
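
    # Worked example of the frame-size arithmetic above (the 16 kHz sample
    # rate is an assumed illustration, not fixed by this module): with
    # sample_rate = 16000, stride_ms = 10.0 and window_ms = 20.0,
    #
    #     stride_size = int(0.001 * 16000 * 10.0) = 160 samples
    #     window_size = int(0.001 * 16000 * 20.0) = 320 samples
    #
    # i.e. each frame spans 320 samples and consecutive frames overlap by
    # 160 samples.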

    def __extract_spectrogram__(self, samples, window_size, stride_size,
                                sample_rate):
        """
        Compute the spectrogram by FFT for a discrete real signal.
        Refer to utils.py in https://github.com/baidu-research/ba-dls-deepspeech
        """
        # extract strided windows
        truncate_size = (len(samples) - window_size) % stride_size
        samples = samples[:len(samples) - truncate_size]
        nshape = (window_size, (len(samples) - window_size) // stride_size + 1)
        nstrides = (samples.strides[0], samples.strides[0] * stride_size)
        windows = np.lib.stride_tricks.as_strided(
            samples, shape=nshape, strides=nstrides)
        assert np.all(
            windows[:, 1] == samples[stride_size:(stride_size + window_size)])
        # window weighting, squared Fast Fourier Transform (fft), scaling
        weighting = np.hanning(window_size)[:, None]
        fft = np.fft.rfft(windows * weighting, axis=0)
        fft = np.absolute(fft)**2
        scale = np.sum(weighting**2) * sample_rate
        fft[1:-1, :] *= (2.0 / scale)
        fft[(0, -1), :] /= scale
        # prepare fft frequency list
        freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
        return fft, freqs
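
    # Illustrative shape check for the strided-window trick above (sizes are
    # assumed example values, not from the source): with len(samples) = 1000,
    # window_size = 320 and stride_size = 160,
    #
    #     truncate_size = (1000 - 320) % 160 = 40   -> 960 samples kept
    #     nshape = (320, (960 - 320) // 160 + 1) = (320, 5)
    #
    # i.e. a zero-copy view of 5 frames, 320 samples each; np.fft.rfft along
    # axis 0 then yields 320 // 2 + 1 = 161 frequency bins per frame.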

    def __load_vocabulary_from_file__(self, vocabulary_path):
        """
        Load vocabulary from file.
        """
        if not os.path.exists(vocabulary_path):
            raise ValueError("Vocabulary file %s not found.", vocabulary_path)
        vocab_lines = []
        with open(vocabulary_path, 'r') as file:
            vocab_lines.extend(file.readlines())
        vocab_list = [line[:-1] for line in vocab_lines]
        vocab_dict = dict(
            (token, token_id) for (token_id, token) in enumerate(vocab_list))
        return vocab_dict, vocab_list
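
    # The vocabulary file is assumed (from the per-line parsing above) to
    # hold one token per line, with the line order defining the indices,
    # e.g. a file starting with lines "a" and "b" yields:
    #
    #     vocab_dict["a"] == 0, vocab_list[0] == "a"
    #     vocab_dict["b"] == 1, vocab_list[1] == "b"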

    def __convert_text_to_char_index__(self, text, vocabulary):
        """
        Convert text string to a list of character index integers.
        """
        return [vocabulary[w] for w in text]

    def __read_manifest__(self, manifest_path, max_duration, min_duration):
        """
        Load and parse manifest file.
        """
        manifest = []
        for json_line in open(manifest_path):
            try:
                json_data = json.loads(json_line)
            except Exception as e:
                raise ValueError("Error reading manifest: %s" % str(e))
            if (json_data["duration"] <= max_duration and
                    json_data["duration"] >= min_duration):
                manifest.append(json_data)
        return manifest

    def __padding_batch__(self, batch, padding_to=-1, flatten=False):
        """
        Pad the audio part of the features (only in the time axis, i.e. the
        column axis) with zeros, so that each instance in the batch shares
        the same audio feature shape.

        If `padding_to` is set to -1, the maximum number of columns in the
        batch will be used as the target size. Otherwise, `padding_to` will
        be the target size. Default is -1.

        If `flatten` is set to True, audio data will be flattened into a
        1-dim ndarray. Default is False.
        """
        new_batch = []
        # get target shape
        max_length = max([audio.shape[1] for audio, text in batch])
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be greater"
                                 " or equal to the original instance length.")
            max_length = padding_to
        # padding
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch
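
    # Example of the padding behavior (shapes are illustrative): for a batch
    # holding two instances with audio shapes (161, 50) and (161, 80),
    # __padding_batch__(batch) zero-pads both to (161, 80); with
    # flatten=True each is further flattened to length 161 * 80 = 12880.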

    def instance_reader_creator(self,
                                manifest_path,
                                sort_by_duration=True,
                                shuffle=False):
        """
        Instance reader creator for audio data. Create a callable function to
        produce instances of data.

        Instance: a tuple of a numpy ndarray of audio spectrogram and a list of
        tokenized and indexed transcription text.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param sort_by_duration: Sort the audio clips by duration if set True
                                 (for SortaGrad).
        :type sort_by_duration: bool
        :param shuffle: Shuffle the audio clips if set True.
        :type shuffle: bool
        :return: Data reader function.
        :rtype: callable
        """
        if sort_by_duration and shuffle:
            sort_by_duration = False
            logger.warn("When shuffle set to true, "
                        "sort_by_duration is forced to set False.")

        def reader():
            # read manifest
            manifest = self.__read_manifest__(
                manifest_path=manifest_path,
                max_duration=self.__max_duration__,
                min_duration=self.__min_duration__)
            # sort (by duration) or shuffle manifest
            if sort_by_duration:
                manifest.sort(key=lambda x: x["duration"])
            if shuffle:
                self.__random__.shuffle(manifest)
            # extract spectrogram feature
            for instance in manifest:
                spectrogram = self.__audio_featurize__(
                    instance["audio_filepath"])
                transcript = self.__text_featurize__(instance["text"])
                yield (spectrogram, transcript)

        return reader
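
    # Minimal usage sketch (the manifest path and `generator` instance are
    # hypothetical):
    #
    #     reader = generator.instance_reader_creator(
    #         manifest_path="./manifest.train", sort_by_duration=True)
    #     for spectrogram, transcript in reader():
    #         pass  # spectrogram: 2-dim ndarray; transcript: token id list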

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             padding_to=-1,
                             flatten=False,
                             sort_by_duration=True,
                             shuffle=False):
        """
        Batch data reader creator for audio data. Create a callable function
        to produce batches of data.

        Audio features will be padded with zeros so that each instance in a
        batch shares the same audio feature shape.

        :param manifest_path: Filepath of manifest for audio clip files.
        :type manifest_path: basestring
        :param batch_size: Instance number in a batch.
        :type batch_size: int
        :param padding_to:  If set to -1, the maximum number of columns in the
                            batch will be used as the target size for padding.
                            Otherwise, `padding_to` will be the target size.
                            Default is -1.
        :type padding_to: int
        :param flatten: If set to True, audio data will be flattened into a
                        1-dim ndarray. Otherwise, a 2-dim ndarray. Default is
                        False.
        :type flatten: bool
        :param sort_by_duration: Sort the audio clips by duration if set True
                                 (for SortaGrad).
        :type sort_by_duration: bool
        :param shuffle: Shuffle the audio clips if set True.
        :type shuffle: bool
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """

        def batch_reader():
            instance_reader = self.instance_reader_creator(
                manifest_path=manifest_path,
                sort_by_duration=sort_by_duration,
                shuffle=shuffle)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self.__padding_batch__(batch, padding_to, flatten)
                    batch = []
            if len(batch) > 0:
                yield self.__padding_batch__(batch, padding_to, flatten)

        return batch_reader
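
    # Minimal usage sketch (path and batch size are hypothetical):
    #
    #     batch_reader = generator.batch_reader_creator(
    #         manifest_path="./manifest.train", batch_size=32, flatten=True)
    #     for batch in batch_reader():
    #         pass  # batch: list of (padded_audio, transcript) tuples with a
    #               # uniform audio shape, at most batch_size items long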

    def vocabulary_size(self):
        """
        Get vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return len(self.__vocab_list__)

    def vocabulary_dict(self):
        """
        Get vocabulary in dict.

        :return: Vocabulary in dict.
        :rtype: dict
        """
        return self.__vocab_dict__

    def vocabulary_list(self):
        """
        Get vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self.__vocab_list__

    def data_name_feeding(self):
        """
        Get feedings (data field name and corresponding field id).

        :return: Feeding dict.
        :rtype: dict
        """
        feeding = {
            "audio_spectrogram": 0,
            "transcript_text": 1,
        }
        return feeding
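

# A minimal end-to-end sketch of wiring this module up (file paths are
# hypothetical; any PaddlePaddle setup is left to the caller):
#
#     generator = DataGenerator(
#         vocab_filepath="eng_vocab.txt",
#         normalizer_manifest_path="./manifest.train")
#     batch_reader = generator.batch_reader_creator(
#         manifest_path="./manifest.train", batch_size=32)
#     feeding = generator.data_name_feeding()
#     # feeding maps "audio_spectrogram" -> 0 and "transcript_text" -> 1,
#     # matching the (spectrogram, transcript) tuples the reader yields.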