dataset.py 10.6 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
from typing import Optional
H
Hui Zhang 已提交
15 16

from paddle.io import Dataset
17
from yacs.config import CfgNode
H
Hui Zhang 已提交
18

19 20
from paddlespeech.s2t.frontend.utility import read_manifest
from paddlespeech.s2t.utils.log import Log
H
Hui Zhang 已提交
21

H
Hui Zhang 已提交
22
__all__ = ["ManifestDataset", "TransformDataset"]
H
Hui Zhang 已提交
23

24 25
logger = Log(__name__).getlog()

H
Hui Zhang 已提交
26

27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
class TextDataset(Dataset):
    """Dataset of plain text lines: one stripped line per item."""

    @classmethod
    def from_file(cls, file_path):
        """Alternate constructor: build a TextDataset from a text file path."""
        return cls(file_path)

    def __init__(self, file_path):
        """Load every line of ``file_path``, stripped of surrounding whitespace."""
        with open(file_path) as fin:
            self._manifest = [line.strip() for line in fin]

    def __len__(self):
        """Number of text lines loaded."""
        return len(self._manifest)

    def __getitem__(self, idx):
        """Return the ``idx``-th stripped line."""
        return self._manifest[idx]


H
Hui Zhang 已提交
46
class ManifestDataset(Dataset):
    """Dataset backed by a manifest JSON file, filtered by length constraints
    and sorted by input length."""

    @classmethod
    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
        """Return the default config node for this dataset.

        Args:
            config (Optional[CfgNode]): if given, the defaults are merged
                into it in place.

        Returns:
            CfgNode: the default config.
        """
        default = CfgNode(
            dict(
                manifest="",
                max_input_len=27.0,
                min_input_len=0.0,
                max_output_len=float('inf'),
                min_output_len=0.0,
                max_output_input_ratio=float('inf'),
                min_output_input_ratio=0.0, ))

        if config is not None:
            config.merge_from_other_cfg(default)
        return default

    @classmethod
    def from_config(cls, config):
        """Build a ManifestDataset object from a config.

        Args:
            config (yacs.config.CfgNode): configs object.

        Returns:
            ManifestDataset: dataset object.
        """
        assert 'manifest' in config.data
        assert config.data.manifest

        dataset = cls(
            manifest_path=config.data.manifest,
            max_input_len=config.data.max_input_len,
            min_input_len=config.data.min_input_len,
            max_output_len=config.data.max_output_len,
            min_output_len=config.data.min_output_len,
            max_output_input_ratio=config.data.max_output_input_ratio,
            min_output_input_ratio=config.data.min_output_input_ratio, )
        return dataset

    def __init__(self,
                 manifest_path,
                 max_input_len=float('inf'),
                 min_input_len=0.0,
                 max_output_len=float('inf'),
                 min_output_len=0.0,
                 max_output_input_ratio=float('inf'),
                 min_output_input_ratio=0.0):
        """Manifest Dataset

        Args:
            manifest_path (str): manifest json file path
            max_input_len (float, optional): maximum input seq length,
                in seconds for raw wav, in frame numbers for feature data.
                Defaults to float('inf').
            min_input_len (float, optional): minimum input seq length,
                in seconds for raw wav, in frame numbers for feature data.
                Defaults to 0.0.
            max_output_len (float, optional): maximum output seq length,
                in modeling units. Defaults to float('inf').
            min_output_len (float, optional): minimum output seq length,
                in modeling units. Defaults to 0.0.
            max_output_input_ratio (float, optional): maximum output seq
                length / input seq length ratio. Defaults to float('inf').
            min_output_input_ratio (float, optional): minimum output seq
                length / input seq length ratio. Defaults to 0.0.
        """
        super().__init__()

        # read manifest, dropping entries outside the length/ratio bounds
        self._manifest = read_manifest(
            manifest_path=manifest_path,
            max_input_len=max_input_len,
            min_input_len=min_input_len,
            max_output_len=max_output_len,
            min_output_len=min_output_len,
            max_output_input_ratio=max_output_input_ratio,
            min_output_input_ratio=min_output_input_ratio)
        # sort by input length so similar-length utterances land in a batch
        self._manifest.sort(key=lambda x: x["feat_shape"][0])

    def __len__(self):
        """Number of manifest entries after filtering."""
        return len(self._manifest)

    def __getitem__(self, idx):
        """Return the ``idx``-th manifest entry (a dict)."""
        return self._manifest[idx]
H
Hui Zhang 已提交
130 131 132 133 134 135 136


class TransformDataset(Dataset):
    """Transform Dataset.

    Indexing reads one pre-built batch via ``reader`` and runs the batch
    ``converter`` over it.

    Args:
        data: list object from make_batchset
        converter: batch function
        reader: read data
    """

    def __init__(self, data, converter, reader):
        """Init function."""
        super().__init__()
        self.data = data
        self.converter = converter
        self.reader = reader

    def __len__(self):
        """Len function."""
        return len(self.data)

    def __getitem__(self, idx):
        """[] operator."""
        batch = self.reader(self.data[idx], return_uttid=True)
        return self.converter([batch])
H
Hui Zhang 已提交
155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280


class AudioDataset(Dataset):
    def __init__(self,
                 data_file,
                 max_length=10240,
                 min_length=0,
                 token_max_length=200,
                 token_min_length=1,
                 batch_type='static',
                 batch_size=1,
                 max_frames_in_batch=0,
                 sort=True,
                 raw_wav=True,
                 stride_ms=10):
        """Dataset for loading audio data.
        Attributes::
            data_file: input data file
                Plain text data file, each line contains following 7 fields,
                which is split by '\t':
                    utt:utt1
                    feat:tmp/data/file1.wav or feat:tmp/data/fbank.ark:30
                    feat_shape: 4.95(in seconds) or feat_shape:495,80(495 is in frames)
                    text:i love you
                    token: i <space> l o v e <space> y o u
                    tokenid: int id of this token
                    token_shape: M,N    # M is the number of token, N is vocab size
            max_length: drop utterance which is greater than max_length(10ms), unit 10ms.
            min_length: drop utterance which is less than min_length(10ms), unit 10ms.
            token_max_length: drop utterance which is greater than token_max_length,
                especially when use char unit for english modeling
            token_min_length: drop utterance which is less than token_max_length
            batch_type: static or dynamic, see max_frames_in_batch(dynamic)
            batch_size: number of utterances in a batch,
               it's for static batch size.
            max_frames_in_batch: max feature frames in a batch,
               when batch_type is dynamic, it's for dynamic batch size.
               Then batch_size is ignored, we will keep filling the
               batch until the total frames in batch up to max_frames_in_batch.
            sort: whether to sort all data, so the utterance with the same
               length could be filled in a same batch.
            raw_wav: use raw wave or extracted feature.
                if raw wave is used, dynamic waveform-level augmentation could be used
                and the feature is extracted by torchaudio.
                if extracted feature(e.g. by kaldi) is used, only feature-level
                augmentation such as specaug could be used.
        """
        assert batch_type in ['static', 'dynamic']
        # read manifest
        data = read_manifest(data_file)
        if sort:
            data = sorted(data, key=lambda x: x["feat_shape"][0])
        if raw_wav:
            # Raw-wav mode must not point at kaldi ark/scp feature files.
            # (Original code called str.splitext(), which does not exist;
            # a suffix check needs no os.path import.)
            path = data[0]['feat'].split(':')[0]
            assert not path.endswith(('.ark', '.scp'))
            # Convert duration in seconds to a frame count (one frame per
            # stride_ms), in place, so max_length/min_length filtering below
            # works in the documented 10ms units. The original
            # `data = map(lambda x: ...)` was called with a single argument
            # (a TypeError) and would have replaced each entry dict with a
            # bare float, breaking every later `entry['feat_shape']` access.
            for entry in data:
                entry['feat_shape'][0] = (
                    float(entry['feat_shape'][0]) * 1000 / stride_ms)

        self.input_dim = data[0]['feat_shape'][1]
        self.output_dim = data[0]['token_shape'][1]

        # Drop utterances that are too long or too short on either the input
        # (frames) or output (tokens) side, to prevent out-of-memory.
        valid_data = []
        for entry in data:
            length = entry['feat_shape'][0]
            token_length = entry['token_shape'][0]
            if length > max_length or length < min_length:
                continue
            if token_length > token_max_length or token_length < token_min_length:
                continue
            valid_data.append(entry)
        data = valid_data

        self.minibatch = []
        num_data = len(data)
        if batch_type == 'dynamic':
            # Dynamic batch size: keep filling the current batch until its
            # total frame count would exceed max_frames_in_batch.
            assert max_frames_in_batch > 0
            self.minibatch.append([])
            num_frames_in_batch = 0
            for entry in data:
                length = entry['feat_shape'][0]
                num_frames_in_batch += length
                if num_frames_in_batch > max_frames_in_batch:
                    self.minibatch.append([])
                    num_frames_in_batch = length
                self.minibatch[-1].append(entry)
        else:
            # Static batch size: fixed number of utterances per batch.
            for start in range(0, num_data, batch_size):
                self.minibatch.append(data[start:start + batch_size])

    def __len__(self):
        """Number of minibatches."""
        return len(self.minibatch)

    def __getitem__(self, idx):
        """Return the ``idx``-th minibatch (a list of manifest entries)."""
        return self.minibatch[idx]