"""Contains data generator for organizing various audio data preprocessing
pipeline and offering data reader interface of PaddlePaddle requirements.
"""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import numpy as np
import paddle.v2 as paddle
from data_utils import utils
from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.speech_featurizer import SpeechFeaturizer
from data_utils.speech import SpeechSegment
from data_utils.normalizer import FeatureNormalizer


class DataGenerator(object):
    """
    DataGenerator provides basic audio data preprocessing pipeline, and offers
    data reader interfaces of PaddlePaddle requirements.

    :param vocab_filepath: Vocabulary filepath for indexing tokenized
                           transcripts.
    :type vocab_filepath: str
    :param mean_std_filepath: File containing the pre-computed mean and stddev.
    :type mean_std_filepath: None|str
    :param augmentation_config: Augmentation configuration in json string.
                                Details see AugmentationPipeline.__doc__.
    :type augmentation_config: str
    :param max_duration: Audio with duration (in seconds) greater than
                         this will be discarded.
    :type max_duration: float
    :param min_duration: Audio with duration (in seconds) smaller than
                         this will be discarded.
    :type min_duration: float
    :param stride_ms: Striding size (in milliseconds) for generating frames.
    :type stride_ms: float
    :param window_ms: Window size (in milliseconds) for generating frames.
    :type window_ms: float
    :param max_freq: Used when specgram_type is 'linear', only FFT bins
                     corresponding to frequencies between [0, max_freq] are
                     returned.
    :type max_freq: None|float
    :param specgram_type: Specgram feature type. Options: 'linear'.
    :type specgram_type: str
    :param random_seed: Random seed. Seeds both the augmentation pipeline
                        and the generator used for batch shuffling.
    :type random_seed: int
    """

    def __init__(self,
                 vocab_filepath,
                 mean_std_filepath,
                 augmentation_config='{}',
                 max_duration=float('inf'),
                 min_duration=0.0,
                 stride_ms=10.0,
                 window_ms=20.0,
                 max_freq=None,
                 specgram_type='linear',
                 random_seed=0):
        self._max_duration = max_duration
        self._min_duration = min_duration
        self._normalizer = FeatureNormalizer(mean_std_filepath)
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            vocab_filepath=vocab_filepath,
            specgram_type=specgram_type,
            stride_ms=stride_ms,
            window_ms=window_ms,
            max_freq=max_freq)
        # Private RNG so that batch shuffling is reproducible and independent
        # of the global `random` module state.
        self._rng = random.Random(random_seed)
        # Number of fully consumed passes of a batch reader; used by sortagrad.
        self._epoch = 0

    def batch_reader_creator(self,
                             manifest_path,
                             batch_size,
                             min_batch_size=1,
                             padding_to=-1,
                             flatten=False,
                             sortagrad=False,
                             batch_shuffle=False):
        """
        Batch data reader creator for audio data. Return a callable generator
        function to produce batches of data.

        Audio features within one batch will be padded with zeros to have the
        same shape, or a user-defined shape.

        :param manifest_path: Filepath of manifest for audio files.
        :type manifest_path: str
        :param batch_size: Number of instances in a batch.
        :type batch_size: int
        :param min_batch_size: Any batch with batch size smaller than this will
                               be discarded. (To be deprecated in the future.)
                               Only the trailing, incomplete batch can be
                               affected; an empty trailing batch is never
                               yielded.
        :type min_batch_size: int
        :param padding_to:  If set -1, the maximum shape in the batch
                            will be used as the target shape for padding.
                            Otherwise, `padding_to` will be the target shape.
        :type padding_to: int
        :param flatten: If set True, audio features will be flatten to 1darray.
        :type flatten: bool
        :param sortagrad: If set True, sort the instances by audio duration
                          in the first epoch for speed up training.
        :type sortagrad: bool
        :param batch_shuffle: If set True, instances are batch-wise shuffled.
                              For more details, please see
                              ``_batch_shuffle.__doc__``.
                              If sortagrad is True, batch_shuffle is disabled
                              for the first epoch.
        :type batch_shuffle: bool
        :return: Batch reader function, producing batches of data when called.
        :rtype: callable
        """

        def batch_reader():
            # The manifest is re-read on every pass, so the list below is
            # private to this pass.
            manifest = utils.read_manifest(
                manifest_path=manifest_path,
                max_duration=self._max_duration,
                min_duration=self._min_duration)
            # Sort (by duration) or batch-wise shuffle the manifest.
            if self._epoch == 0 and sortagrad:
                manifest.sort(key=lambda x: x["duration"])
            elif batch_shuffle:
                manifest = self._batch_shuffle(manifest, batch_size)
            # Prepare batches.
            instance_reader = self._instance_reader_creator(manifest)
            batch = []
            for instance in instance_reader():
                batch.append(instance)
                if len(batch) == batch_size:
                    yield self._padding_batch(batch, padding_to, flatten)
                    batch = []
            # Trailing partial batch. The emptiness check keeps an empty batch
            # away from `_padding_batch` when min_batch_size <= 0.
            if batch and len(batch) >= min_batch_size:
                yield self._padding_batch(batch, padding_to, flatten)
            # Only counted once the reader has been consumed to the end.
            self._epoch += 1

        return batch_reader

    @property
    def feeding(self):
        """Returns data reader's feeding dict.

        :return: Data feeding dict.
        :rtype: dict
        """
        return {"audio_spectrogram": 0, "transcript_text": 1}

    @property
    def vocab_size(self):
        """Return the vocabulary size.

        :return: Vocabulary size.
        :rtype: int
        """
        return self._speech_featurizer.vocab_size

    @property
    def vocab_list(self):
        """Return the vocabulary in list.

        :return: Vocabulary in list.
        :rtype: list
        """
        return self._speech_featurizer.vocab_list

    def _process_utterance(self, filename, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param filename: Audio filepath.
        :param transcript: Transcription text of the audio.
        :return: Tuple of (normalized spectrogram, token indices).
        :rtype: tuple
        """
        speech_segment = SpeechSegment.from_file(filename, transcript)
        # The return value is not used here; the segment itself is passed on
        # to the featurizer afterwards.
        self._augmentation_pipeline.transform_audio(speech_segment)
        specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
        specgram = self._normalizer.apply(specgram)
        return specgram, text_ids

    def _instance_reader_creator(self, manifest):
        """
        Instance reader creator. Create a callable function to produce
        instances of data.

        Instance: a tuple of ndarray of audio spectrogram and a list of
        token indices for transcript.

        :param manifest: Manifest contents. Each entry must provide the keys
                         "audio_filepath" and "text".
        :type manifest: list
        :return: Generator function yielding one instance per manifest entry.
        :rtype: callable
        """

        def reader():
            for instance in manifest:
                yield self._process_utterance(instance["audio_filepath"],
                                              instance["text"])

        return reader

    def _padding_batch(self, batch, padding_to=-1, flatten=False):
        """
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one batch.

        If ``padding_to`` is -1, the maximum shape in the batch will be used
        as the target shape for padding. Otherwise, `padding_to` will be the
        target shape (only refers to the second axis).

        If `flatten` is True, features will be flatten to 1darray.

        :param batch: Non-empty list of (audio, text) tuples, where audio is
                      a 2-D ndarray.
        :type batch: list
        :param padding_to: Target length of the second axis, or -1.
        :type padding_to: int
        :param flatten: Whether to flatten each padded feature.
        :type flatten: bool
        :return: List of (padded_audio, text) tuples.
        :rtype: list
        :raises ValueError: If `padding_to` is not -1 and is smaller than the
                            longest instance in the batch.
        """
        # Get target shape.
        max_length = max(audio.shape[1] for audio, _ in batch)
        if padding_to != -1:
            if padding_to < max_length:
                raise ValueError("If padding_to is not -1, it should be larger "
                                 "than any instance's shape in the batch")
            max_length = padding_to
        # Padding: copy each feature into the left part of a zero matrix.
        new_batch = []
        for audio, text in batch:
            padded_audio = np.zeros([audio.shape[0], max_length])
            padded_audio[:, :audio.shape[1]] = audio
            if flatten:
                padded_audio = padded_audio.flatten()
            new_batch.append((padded_audio, text))
        return new_batch

    def _batch_shuffle(self, manifest, batch_size):
        """Put similarly-sized instances into minibatches for better efficiency
        and make a batch-wise shuffle.

        1. Sort the audio clips by duration.
        2. Generate a random number `k`, k in [0, batch_size).
        3. Randomly shift `k` instances in order to create different batches
           for different epochs. Create minibatches.
        4. Shuffle the minibatches.

        Instances that do not fit into a full minibatch (the tail after the
        last full minibatch, then the `k` shifted-out head instances) are
        appended after the shuffled minibatches, so the result always holds
        every input instance exactly once. The input list is not modified.

        :param manifest: Manifest contents. List of dict.
        :type manifest: list
        :param batch_size: Batch size. This size is also used for generate
                           a random number for batch shuffle.
        :type batch_size: int
        :return: Batch shuffled manifest.
        :rtype: list
        """
        ordered = sorted(manifest, key=lambda x: x["duration"])
        shift_len = self._rng.randint(0, batch_size - 1)
        # zip over `batch_size` references to one iterator groups consecutive
        # instances into full batches and drops the incomplete remainder.
        # Materialize as a list: on Python 3 zip() is lazy and cannot be
        # shuffled in place.
        batches = list(zip(*[iter(ordered[shift_len:])] * batch_size))
        self._rng.shuffle(batches)
        shuffled = [instance for batch in batches for instance in batch]
        # Slice the leftovers by absolute position. A negative-length slice
        # such as `ordered[-0:]` would return the whole list and duplicate
        # every instance when nothing is left over.
        batched_end = shift_len + len(shuffled)
        shuffled.extend(ordered[batched_end:])
        shuffled.extend(ordered[:shift_len])
        return shuffled