collator.py 7.8 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np

16 17 18
from deepspeech.frontend.utility import IGNORE_ID
from deepspeech.io.utility import pad_sequence
from deepspeech.utils.log import Log
H
Haoxin Ma 已提交
19 20 21 22 23 24
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.normalizer import FeatureNormalizer
from deepspeech.frontend.speech import SpeechSegment
import io
import time
25 26

__all__ = ["SpeechCollator"]
H
Hui Zhang 已提交
27

28
logger = Log(__name__).getlog()
H
Hui Zhang 已提交
29

H
Haoxin Ma 已提交
30 31
# namedtupe need global for pickle.
TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
H
Hui Zhang 已提交
32 33

class SpeechCollator():
H
Haoxin Ma 已提交
34
    def __init__(self, config, keep_transcription_text=True):
H
Hui Zhang 已提交
35 36 37 38
        """
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one bach.

39
        if ``keep_transcription_text`` is False, text is token ids else is raw string.
H
Hui Zhang 已提交
40
        """
41
        self._keep_transcription_text = keep_transcription_text
H
Hui Zhang 已提交
42

H
Haoxin Ma 已提交
43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148
        if isinstance(config.data.augmentation_config, (str, bytes)):
            if config.data.augmentation_config:
                aug_file = io.open(
                    config.data.augmentation_config, mode='r', encoding='utf8')
            else:
                aug_file = io.StringIO(initial_value='{}', newline='')
        else:
            aug_file = config.data.augmentation_config
            assert isinstance(aug_file, io.StringIO)

        self._local_data = TarLocalData(tar2info={}, tar2object={}
        self._augmentation_pipeline = AugmentationPipeline(
            augmentation_config=aug_file.read(), 
            random_seed=config.data.random_seed)
        
        self._normalizer = FeatureNormalizer(
            config.data.mean_std_filepath) if config.data.mean_std_filepath else None

        self._stride_ms = config.data.stride_ms
        self._target_sample_rate = config.data.target_sample_rate

        self._speech_featurizer = SpeechFeaturizer(
            unit_type=config.data.unit_type,
            vocab_filepath=config.data.vocab_filepath,
            spm_model_prefix=config.data.spm_model_prefix,
            specgram_type=config.data.specgram_type,
            feat_dim=config.data.feat_dim,
            delta_delta=config.data.delta_delta,
            stride_ms=config.data.stride_ms,
            window_ms=config.data.window_ms,
            n_fft=config.data.n_fft,
            max_freq=config.data.max_freq,
            target_sample_rate=config.data.target_sample_rate,
            use_dB_normalization=config.data.use_dB_normalization,
            target_dB=config.data.target_dB,
            dither=config.data.dither)

    def _parse_tar(self, file):
        """Parse a tar file to get a tarfile object
        and a map containing tarinfoes
        """
        result = {}
        f = tarfile.open(file)
        for tarinfo in f.getmembers():
            result[tarinfo.name] = tarinfo
        return f, result

    def _subfile_from_tar(self, file):
        """Get subfile object from tar.

        It will return a subfile object from tar file
        and cached tar file info for next reading request.
        """
        tarpath, filename = file.split(':', 1)[1].split('#', 1)
        if 'tar2info' not in self._local_data.__dict__:
            self._local_data.tar2info = {}
        if 'tar2object' not in self._local_data.__dict__:
            self._local_data.tar2object = {}
        if tarpath not in self._local_data.tar2info:
            object, infoes = self._parse_tar(tarpath)
            self._local_data.tar2info[tarpath] = infoes
            self._local_data.tar2object[tarpath] = object
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
        :type audio_file: str | file
        :param transcript: Transcription text.
        :type transcript: str
        :return: Tuple of audio feature tensor and data of transcription part,
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
        start_time = time.time()
        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
            speech_segment = SpeechSegment.from_file(
                self._subfile_from_tar(audio_file), transcript)
        else:
            speech_segment = SpeechSegment.from_file(audio_file, transcript)
        load_wav_time = time.time() - start_time
        #logger.debug(f"load wav time: {load_wav_time}")

        # audio augment
        start_time = time.time()
        self._augmentation_pipeline.transform_audio(speech_segment)
        audio_aug_time = time.time() - start_time
        #logger.debug(f"audio augmentation time: {audio_aug_time}")

        start_time = time.time()
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)
        feature_time = time.time() - start_time
        #logger.debug(f"audio & test feature time: {feature_time}")

        # specgram augment
        start_time = time.time()
        specgram = self._augmentation_pipeline.transform_feature(specgram)
        feature_aug_time = time.time() - start_time
        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
        return specgram, transcript_part

H
Hui Zhang 已提交
149
    def __call__(self, batch):
150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
        """batch examples

        Args:
            batch ([List]): batch is (audio, text)
                audio (np.ndarray) shape (D, T)
                text (List[int] or str): shape (U,)

        Returns:
            tuple(audio, text, audio_lens, text_lens): batched data.
                audio : (B, Tmax, D)
                audio_lens: (B)
                text : (B, Umax)
                text_lens: (B)
        """
        audios = []
H
Hui Zhang 已提交
165
        audio_lens = []
166 167
        texts = []
        text_lens = []
H
Haoxin Ma 已提交
168
        utts = []
H
Haoxin Ma 已提交
169
        for utt, audio, text in batch:
H
Haoxin Ma 已提交
170
            audio, text = self.process_utterance(audio, text)
H
Haoxin Ma 已提交
171 172
            #utt
            utts.append(utt)
H
Hui Zhang 已提交
173
            # audio
174
            audios.append(audio.T)  # [T, D]
H
Hui Zhang 已提交
175 176
            audio_lens.append(audio.shape[1])
            # text
177 178 179 180 181 182
            # for training, text is token ids
            # else text is string, convert to unicode ord
            tokens = []
            if self._keep_transcription_text:
                assert isinstance(text, str), (type(text), text)
                tokens = [ord(t) for t in text]
H
Hui Zhang 已提交
183
            else:
184 185 186 187 188
                tokens = text  # token ids
            tokens = tokens if isinstance(tokens, np.ndarray) else np.array(
                tokens, dtype=np.int64)
            texts.append(tokens)
            text_lens.append(tokens.shape[0])
H
Hui Zhang 已提交
189

190 191 192 193 194 195
        padded_audios = pad_sequence(
            audios, padding_value=0.0).astype(np.float32)  #[B, T, D]
        audio_lens = np.array(audio_lens).astype(np.int64)
        padded_texts = pad_sequence(
            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
H
Haoxin Ma 已提交
196
        return utts, padded_audios, audio_lens, padded_texts, text_lens