# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from espnet(https://github.com/espnet/espnet)
# Modified from wenet(https://github.com/wenet-e2e/wenet)
import os
from typing import Optional

from paddle.io import Dataset
from yacs.config import CfgNode

from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.utils.log import Log

# Names exported by a star-import of this module.
__all__ = ["ManifestDataset", "TransformDataset"]

# Module-level logger; AudioDataset uses it to report dataset sizes.
logger = Log(__name__).getlog()

class TextDataset(Dataset):
    """Dataset whose items are the whitespace-stripped lines of a text file."""

    @classmethod
    def from_file(cls, file_path):
        """Build a dataset from a text file.

        Args:
            file_path (str): path of the text file to read.

        Returns:
            TextDataset: dataset holding one item per line of the file.
        """
        return cls(file_path)

    def __init__(self, file_path):
        """Read every line of ``file_path`` into memory.

        Args:
            file_path (str): path of the text file to read.
        """
        super().__init__()
        # Explicit encoding so that decoding does not depend on the
        # platform's locale default.
        with open(file_path, encoding="utf-8") as f:
            self._manifest = [line.strip() for line in f]

    def __len__(self):
        """Return the number of lines that were read."""
        return len(self._manifest)

    def __getitem__(self, idx):
        """Return the stripped line stored at position ``idx``."""
        return self._manifest[idx]


class ManifestDataset(Dataset):
    """Dataset over the entries returned by ``read_manifest``.

    The entries are filtered by the length limits given to ``__init__`` and
    then sorted in ascending order of ``entry["feat_shape"][0]``.
    """

    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        """Return the default data configuration.

        Args:
            config (Optional[CfgNode]): when given, the default values are
                merged into this node in place with ``merge_from_other_cfg``.

        Returns:
            CfgNode: node holding the default values.
        """
        default = CfgNode(
            dict(
                manifest="",
                max_input_len=27.0,
                min_input_len=0.0,
                max_output_len=float('inf'),
                min_output_len=0.0,
                max_output_input_ratio=float('inf'),
                min_output_input_ratio=0.0, ))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a ManifestDataset object from a config.

        Args:
            config (yacs.config.CfgNode): configs object. ``config.data`` must
                contain a non-empty ``manifest`` entry.

        Returns:
            ManifestDataset: dataset object.
        """
        assert 'manifest' in config.data
        assert config.data.manifest

        dataset = cls(
            manifest_path=config.data.manifest,
            max_input_len=config.data.max_input_len,
            min_input_len=config.data.min_input_len,
            max_output_len=config.data.max_output_len,
            min_output_len=config.data.min_output_len,
            max_output_input_ratio=config.data.max_output_input_ratio,
            min_output_input_ratio=config.data.min_output_input_ratio, )
        return dataset

    def __init__(self,
                 manifest_path,
                 max_input_len=float('inf'),
                 min_input_len=0.0,
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
                 min_output_input_ratio=0.0):
        """Manifest Dataset

        Args:
            manifest_path (str): manifest json file path
            max_input_len (float, optional): maximum input seq length,
                in seconds for raw wav, in frame numbers for feature data.
                Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data.
                Defaults to 0.0.
            max_output_len (float, optional): maximum output seq length,
                in modeling units. Defaults to float('inf').
            min_output_len (float, optional): minimum output seq length,
                in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum
                output seq length/input seq length ratio.
                Defaults to float('inf').
            min_output_input_ratio (float, optional): minimum
                output seq length/input seq length ratio.
                Defaults to 0.0.

        """
        super().__init__()

        # read manifest; the length limits are forwarded to read_manifest
        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_input_len=max_input_len,
            min_input_len=min_input_len,
            max_output_len=max_output_len,
            min_output_len=min_output_len,
            max_output_input_ratio=max_output_input_ratio,
            min_output_input_ratio=min_output_input_ratio)
        # ascending order of the first feat_shape dimension
        self._manifest.sort(key=lambda x: x["feat_shape"][0])

    def __len__(self):
        """Return the number of manifest entries kept."""
        return len(self._manifest)

    def __getitem__(self, idx):
        """Return the manifest entry at position ``idx``."""
        return self._manifest[idx]


class TransformDataset(Dataset):
    """Transform Dataset.

    Each item is produced lazily: the stored entry is passed through
    ``reader`` and the result, wrapped in a one-element list, is passed
    through ``converter``.

    Args:
        data: list object from make_batchset
        converter: batch function
        reader: read data
    """

    def __init__(self, data, converter, reader):
        """Init function."""
        super().__init__()
        self.data = data
        self.converter = converter
        self.reader = reader

    def __len__(self):
        """Len function."""
        return len(self.data)

    def __getitem__(self, idx):
        """[] operator.

        Returns whatever ``converter`` returns for the single entry read by
        ``reader`` (called with ``return_uttid=True``).
        """
        return self.converter([self.reader(self.data[idx], return_uttid=True)])


class AudioDataset(Dataset):
    """Dataset whose items are pre-built minibatches of manifest entries."""

    def __init__(self,
                 data_file,
                 max_length=10240,
                 min_length=0,
                 token_max_length=200,
                 token_min_length=1,
                 batch_type='static',
                 batch_size=1,
                 max_frames_in_batch=0,
                 sort=True,
                 raw_wav=True,
                 stride_ms=10):
        """Dataset for loading audio data.

        Attributes::
            data_file: input data file
                Plain text data file, each line contains following 7 fields,
                which is split by '\\t':
                    utt:utt1
                    feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30
                    feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames)
                    text:i love you
                    token: i <space> l o v e <space> y o u
                    tokenid: int id of this token
                    token_shape: M,N    # M is the number of token, N is vocab size
            max_length: drop utterance which is longer than max_length frames
                (one frame is stride_ms, 10ms by default).
            min_length: drop utterance which is shorter than min_length frames
                (one frame is stride_ms, 10ms by default).
            token_max_length: drop utterance which is greater than token_max_length,
                especially when use char unit for english modeling
            token_min_length: drop utterance which is less than token_min_length
            batch_type: static or dynamic, see max_frames_in_batch(dynamic)
            batch_size: number of utterances in a batch,
               it's for static batch size. Must be positive.
            max_frames_in_batch: max feature frames in a batch,
               when batch_type is dynamic, it's for dynamic batch size.
               Then batch_size is ignored, we will keep filling the
               batch until the total frames in batch up to max_frames_in_batch.
            sort: whether to sort all data, so the utterance with the same
               length could be filled in a same batch.
            raw_wav: use raw wave or extracted feature.
                if raw wave is used, feat_shape[0] is taken to be a duration
                in seconds and is converted to a frame count with stride_ms.
                if extracted feature(e.g. by kaldi) is used, feat_shape[0]
                is used as is.
            stride_ms: frame shift in milliseconds used for the
                seconds-to-frames conversion when raw_wav is True.
        """
        super().__init__()
        assert batch_type in ['static', 'dynamic']
        # read manifest
        data = read_manifest(data_file)
        if sort:
            data = sorted(data, key=lambda x: x["feat_shape"][0])
        if raw_wav and data:
            # 'feat' may carry an ':offset' suffix; keep only the path part.
            path_suffix = os.path.splitext(data[0]['feat'].split(':')[0])[-1]
            assert path_suffix not in ('.ark', '.scp')
            # Convert seconds to frames while keeping every record a dict,
            # so the fields read below are still available.
            data = [self._seconds_to_frames(x, stride_ms) for x in data]

        if data:
            first_feat_shape = data[0]['feat_shape']
            # NOTE(review): raw-wav manifests may carry only the duration in
            # feat_shape (no feature dimension) - confirm against the
            # manifest format; input_dim is None in that case.
            self.input_dim = (first_feat_shape[1]
                              if len(first_feat_shape) > 1 else None)
            self.output_dim = data[0]['token_shape'][1]
        else:
            # Empty manifest: no dimensions can be derived.
            self.input_dim = None
            self.output_dim = None

        # remove too long or too short utt for both input and output
        # to prevent from out of memory
        valid_data = [
            x for x in data
            if min_length <= x['feat_shape'][0] <= max_length and
            token_min_length <= x['token_shape'][0] <= token_max_length
        ]
        logger.info(f"raw dataset len: {len(data)}")
        data = valid_data
        num_data = len(data)
        logger.info(f"dataset len after filter: {num_data}")

        if batch_type == 'dynamic':
            assert (max_frames_in_batch > 0)
            self.minibatch = self._dynamic_batches(data, max_frames_in_batch)
        else:
            # A non-positive batch size could never make progress.
            assert (batch_size > 0)
            self.minibatch = self._static_batches(data, batch_size)

    @staticmethod
    def _seconds_to_frames(record, stride_ms):
        """Return a copy of ``record`` whose feat_shape[0] is in frames."""
        feat_shape = record['feat_shape']
        num_frames = float(feat_shape[0]) * 1000 / stride_ms
        return dict(record, feat_shape=[num_frames, *feat_shape[1:]])

    @staticmethod
    def _dynamic_batches(data, max_frames_in_batch):
        """Group records so each batch holds at most max_frames_in_batch frames.

        A single record longer than the limit gets a batch of its own; no
        empty batch is ever produced.
        """
        batches = []
        current = []
        num_frames_in_batch = 0
        for record in data:
            length = record['feat_shape'][0]
            if current and num_frames_in_batch + length > max_frames_in_batch:
                batches.append(current)
                current = []
                num_frames_in_batch = 0
            current.append(record)
            num_frames_in_batch += length
        if current:
            batches.append(current)
        return batches

    @staticmethod
    def _static_batches(data, batch_size):
        """Split records into consecutive chunks of at most batch_size items."""
        return [
            list(data[start:start + batch_size])
            for start in range(0, len(data), batch_size)
        ]

    def __len__(self):
        """number of example(batch)"""
        return len(self.minibatch)

    def __getitem__(self, idx):
        """batch example of idx"""
        return self.minibatch[idx]