# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import glob
import six
import os
import io
import itertools
from functools import partial

import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.parallel import ParallelEnv
from paddle.fluid.io import BatchSampler, DataLoader, Dataset


def create_data_loader(args, device):
    """
    Build the data loaders for the training file and, when configured, the
    validation file.

    Args:
        args: Configuration object. The file paths, vocabulary paths,
            special tokens and batching options are read from it. The
            vocabulary sizes and the bos/eos/unk indices are written back
            onto it as ``src_vocab_size``, ``trg_vocab_size``, ``bos_idx``,
            ``eos_idx`` and ``unk_idx``.
        device: Forwarded unchanged as ``places`` to ``DataLoader``.

    Returns:
        A two-element list ``[train_loader, validation_loader]``; the second
        entry stays ``None`` when ``args.validation_file`` is falsy.
    """
    data_files = [args.training_file]
    if args.validation_file:
        data_files.append(args.validation_file)

    data_loaders = [None, None]
    for slot, data_file in enumerate(data_files):
        dataset = Seq2SeqDataset(
            fpattern=data_file,
            src_vocab_fpath=args.src_vocab_fpath,
            trg_vocab_fpath=args.trg_vocab_fpath,
            token_delimiter=args.token_delimiter,
            start_mark=args.special_token[0],
            end_mark=args.special_token[1],
            unk_mark=args.special_token[2],
            byte_data=True)
        (args.src_vocab_size, args.trg_vocab_size, args.bos_idx,
         args.eos_idx, args.unk_idx) = dataset.get_vocab_summary()

        # Only the training data (slot 0) is sharded across devices; every
        # device evaluates all of the validation data.
        is_training_slot = slot == 0
        batch_sampler = Seq2SeqBatchSampler(
            dataset=dataset,
            use_token_batch=args.use_token_batch,
            batch_size=args.batch_size,
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            shuffle=args.shuffle,
            shuffle_batch=args.shuffle_batch,
            max_length=args.max_length,
            distribute_mode=is_training_slot)

        # Both source and target sequences are padded with the eos index.
        collate_fn = partial(
            prepare_train_input,
            bos_idx=args.bos_idx,
            eos_idx=args.eos_idx,
            src_pad_idx=args.eos_idx,
            trg_pad_idx=args.eos_idx,
            n_head=args.n_head)

        data_loaders[slot] = DataLoader(
            dataset=dataset,
            batch_sampler=batch_sampler,
            places=device,
            collate_fn=collate_fn,
            num_workers=0,  # TODO: use multi-process
            return_list=True)
    return data_loaders


def prepare_train_input(insts, bos_idx, eos_idx, src_pad_idx, trg_pad_idx,
                        n_head):
    """
    Put all padded data needed by training into a list.

    Each element of ``insts`` is indexed as ``inst[0]`` (source ids) and
    ``inst[1]`` (target ids). The source gets ``eos_idx`` appended, the
    decoder input gets ``bos_idx`` prepended and the label gets ``eos_idx``
    appended before padding.

    Returns:
        ``[src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight]``.
    """
    src_seqs = [inst[0] + [eos_idx] for inst in insts]
    trg_input_seqs = [[bos_idx] + inst[1] for inst in insts]
    label_seqs = [inst[1] + [eos_idx] for inst in insts]

    # Encoder side.
    src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
        src_seqs, src_pad_idx, n_head, is_target=False)
    src_word = src_word.reshape(-1, src_max_len)
    src_pos = src_pos.reshape(-1, src_max_len)

    # Decoder side.
    trg_word, trg_pos, trg_slf_attn_bias, trg_max_len = pad_batch_data(
        trg_input_seqs, trg_pad_idx, n_head, is_target=True)
    trg_word = trg_word.reshape(-1, trg_max_len)
    trg_pos = trg_pos.reshape(-1, trg_max_len)

    # Stepping the third axis by src_max_len keeps one row of the source
    # padding bias, which is then repeated for every target position.
    src_bias_row = src_slf_attn_bias[:, :, ::src_max_len, :]
    trg_src_attn_bias = np.tile(src_bias_row,
                                [1, 1, trg_max_len, 1]).astype("float32")

    # Labels; the returned token count is not needed here.
    lbl_word, lbl_weight, _ = pad_batch_data(
        label_seqs,
        trg_pad_idx,
        n_head,
        is_target=False,
        is_label=True,
        return_attn_bias=False,
        return_max_len=False,
        return_num_token=True)
    lbl_word = lbl_word.reshape(-1, 1)
    lbl_weight = lbl_weight.reshape(-1, 1)

    return [
        src_word, src_pos, src_slf_attn_bias, trg_word, trg_pos,
        trg_slf_attn_bias, trg_src_attn_bias, lbl_word, lbl_weight
    ]


def prepare_infer_input(insts, bos_idx, eos_idx, src_pad_idx, n_head):
    """
    Put all padded data needed by beam search decoder into a list.

    Only ``inst[0]`` (the source ids) of each instance is used; ``eos_idx``
    is appended to it before padding. ``bos_idx`` is accepted but not used
    in this function.

    Returns:
        ``[src_word, src_pos, src_slf_attn_bias, trg_src_attn_bias]``.
    """
    src_seqs = [inst[0] + [eos_idx] for inst in insts]
    src_word, src_pos, src_slf_attn_bias, src_max_len = pad_batch_data(
        src_seqs, src_pad_idx, n_head, is_target=False)

    # Keep a single row of the source padding bias (third axis stepped by
    # src_max_len); the all-ones tile makes a copy without repeating it.
    src_bias_row = src_slf_attn_bias[:, :, ::src_max_len, :]
    trg_src_attn_bias = np.tile(src_bias_row, [1, 1, 1, 1]).astype("float32")

    return [
        src_word.reshape(-1, src_max_len),
        src_pos.reshape(-1, src_max_len),
        src_slf_attn_bias,
        trg_src_attn_bias,
    ]

def pad_batch_data(insts,
                   pad_idx,
                   n_head,
                   is_target=False,
                   is_label=False,
                   return_attn_bias=True,
                   return_max_len=True,
                   return_num_token=False):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and attention bias.

    Args:
        insts: List of token-id lists.
        pad_idx: Id appended to short instances.
        n_head: Number of attention heads the bias is tiled over.
        is_target: Build a bias masking subsequent positions instead of a
            bias masking padding positions.
        is_label: Emit per-token weights (1 for real tokens, 0 for padding)
            instead of position ids.
        return_attn_bias: Append the float32 attention bias.
        return_max_len: Append the padded length.
        return_num_token: Append the number of non-padding tokens.

    Returns:
        A list holding, in order: the padded ids (int64, shape ``[-1, 1]``),
        the weights or positions (shape ``[-1, 1]``), then the optional
        items selected by the ``return_*`` flags.
    """
    lengths = [len(inst) for inst in insts]
    max_len = max(lengths)
    pad_counts = [max_len - length for length in lengths]

    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    padded = np.array(
        [inst + [pad_idx] * n_pad for inst, n_pad in zip(insts, pad_counts)])
    outputs = [padded.astype("int64").reshape([-1, 1])]

    if is_label:
        # label weight
        weights = np.array([[1.] * length + [0.] * n_pad
                            for length, n_pad in zip(lengths, pad_counts)])
        outputs.append(weights.astype("float32").reshape([-1, 1]))
    else:
        # position data
        positions = np.array([
            list(range(length)) + [0] * n_pad
            for length, n_pad in zip(lengths, pad_counts)
        ])
        outputs.append(positions.astype("int64").reshape([-1, 1]))

    if return_attn_bias:
        if is_target:
            # Strictly-upper-triangular mask: a position may not attend to
            # the positions after it.
            future_mask = np.triu(
                np.ones((padded.shape[0], max_len, max_len)), 1)
            future_mask = future_mask.reshape([-1, 1, max_len, max_len])
            attn_bias = np.tile(future_mask, [1, n_head, 1, 1]) * [-1e9]
        else:
            # Large negative bias on padding columns only.
            padding_mask = np.array([[0] * length + [-1e9] * n_pad
                                     for length, n_pad in zip(lengths,
                                                              pad_counts)])
            attn_bias = np.tile(
                padding_mask.reshape([-1, 1, 1, max_len]),
                [1, n_head, max_len, 1])
        outputs.append(attn_bias.astype("float32"))

    if return_max_len:
        outputs.append(max_len)
    if return_num_token:
        outputs.append(sum(lengths))
    return outputs if len(outputs) > 1 else outputs[0]


class SortType(object):
    """String constants naming the sample ordering applied before batching."""

    # Sort all samples by length.
    GLOBAL = "global"
    # Sort samples by length within fixed-size pools.
    POOL = "pool"
    # Leave the sample order untouched.
    NONE = "none"


class Converter(object):
    """
    Callable turning one delimited sentence into a list of vocabulary ids.

    Tokens missing from ``vocab`` map to ``unk``. ``beg`` / ``end`` are
    added around the ids only when ``add_beg`` / ``add_end`` are truthy.
    """

    def __init__(self, vocab, beg, end, unk, delimiter, add_beg, add_end):
        self._vocab = vocab
        self._delimiter = delimiter
        self._unk = unk
        self._beg, self._add_beg = beg, add_beg
        self._end, self._add_end = end, add_end

    def __call__(self, sentence):
        """Split ``sentence`` on the delimiter and look every token up."""
        lookup = self._vocab.get
        ids = [lookup(token, self._unk)
               for token in sentence.split(self._delimiter)]
        if self._add_beg:
            ids.insert(0, self._beg)
        if self._add_end:
            ids.append(self._end)
        return ids

class ComposedConverter(object):
    """Apply one converter per field, pairing them up positionally."""

    def __init__(self, converters):
        self._converters = converters

    def __call__(self, fields):
        """
        Convert ``fields[k]`` with the k-th converter.

        Pairing stops at the shorter of the two sequences, so surplus
        fields or converters are ignored.
        """
        converted = []
        for field, convert in zip(fields, self._converters):
            converted.append(convert(field))
        return converted


class SentenceBatchCreator(object):
    """Accumulate samples into batches holding a fixed number of samples."""

    def __init__(self, batch_size):
        self._batch_size = batch_size
        self.batch = []

    def append(self, info):
        """
        Add ``info`` to the pending batch.

        Returns the completed batch once it holds exactly ``batch_size``
        samples (and starts a fresh one), otherwise returns ``None``.
        """
        self.batch.append(info)
        if len(self.batch) != self._batch_size:
            return None
        completed, self.batch = self.batch, []
        return completed


class TokenBatchCreator(object):
    """
    Accumulate samples into batches bounded by a padded-token budget.

    The cost of a batch is ``longest sample length * number of samples``;
    a batch is closed as soon as adding the next sample would push that
    cost above ``batch_size``.
    """

    def __init__(self, batch_size):
        self.batch = []
        self.max_len = -1
        self._batch_size = batch_size

    def append(self, info):
        """
        Add ``info`` (an object exposing ``max_len``) to the pending batch.

        Returns:
            The previously pending batch when ``info`` did not fit into it
            and therefore starts a new one, otherwise ``None``. An empty
            batch is never returned: when the very first sample already
            exceeds the budget there is nothing to flush, so ``None`` is
            returned and the sample simply becomes the pending batch.
        """
        cur_len = info.max_len
        max_len = max(self.max_len, cur_len)
        if max_len * (len(self.batch) + 1) > self._batch_size:
            result = self.batch
            self.batch = [info]
            self.max_len = cur_len
            # Callers treat any non-None value as a finished batch, so an
            # empty pending batch must not be handed out.
            return result if result else None
        self.max_len = max_len
        self.batch.append(info)
        return None


class SampleInfo(object):
    """
    Index and length summary of one sample, used for sorting and batching.

    Attributes are ``i`` (the sample index), ``min_len`` and ``max_len``.
    """

    def __init__(self, i, lens):
        """
        Args:
            i: Index of the sample in the dataset.
            lens: Token counts of the sample's fields; ``lens[0]`` is the
                source length and ``lens[1]``, when present, the target
                length. A single-entry ``lens`` (source-only data) is
                accepted, in which case both lengths come from the source.

        Raises:
            IndexError: If ``lens`` is empty.
        """
        self.i = i
        # take bos and eos into account
        effective_lens = [lens[0] + 1]
        if len(lens) > 1:
            effective_lens.append(lens[1] + 2)
        self.min_len = min(effective_lens)
        self.max_len = max(effective_lens)

class MinMaxFilter(object):
    """
    Wrap a batch creator and drop samples outside ``[min_len, max_len]``.
    """

    def __init__(self, max_len, min_len, underlying_creator):
        self._creator = underlying_creator
        self._max_len = max_len
        self._min_len = min_len

    def append(self, info):
        """
        Forward ``info`` to the wrapped creator unless it is too long
        (``info.max_len`` above the maximum) or too short (``info.min_len``
        below the minimum); rejected samples yield ``None``.
        """
        rejected = (info.max_len > self._max_len or
                    info.min_len < self._min_len)
        if not rejected:
            return self._creator.append(info)
        return None

    @property
    def batch(self):
        """The pending batch of the wrapped creator."""
        return self._creator.batch


class Seq2SeqDataset(Dataset):
    """
    Dataset of source/target token-id sequences loaded eagerly from text
    files.

    All matching files are read at construction time and every field is
    converted to vocabulary ids. The bos/eos/unk indices are looked up in
    the source vocabulary and reused for the target side.
    """

    def __init__(self,
                 src_vocab_fpath,
                 trg_vocab_fpath,
                 fpattern,
                 field_delimiter="\t",
                 token_delimiter=" ",
                 start_mark="<s>",
                 end_mark="<e>",
                 unk_mark="<unk>",
                 trg_fpattern=None,
                 byte_data=False):
        """
        Args:
            src_vocab_fpath: Path of the source vocabulary, one token per
                line.
            trg_vocab_fpath: Path of the target vocabulary, one token per
                line.
            fpattern: Glob pattern of the data files. When ``trg_fpattern``
                is None each line holds the fields separated by
                ``field_delimiter``; otherwise these files hold the source
                side only.
            field_delimiter: Separator between fields on a line.
            token_delimiter: Separator between tokens inside a field.
            start_mark: Begin-of-sequence token; must be in the source
                vocabulary.
            end_mark: End-of-sequence token; must be in the source
                vocabulary.
            unk_mark: Unknown-word token; must be in the source vocabulary.
            trg_fpattern: Optional glob pattern of separate target files.
            byte_data: Read files in binary mode and handle tokens as
                ``bytes``.

        Raises:
            KeyError: If a special token is missing from the source
                vocabulary.
        """
        if byte_data:
            # The WMT16 bpe data used here seems including bytes can not be
            # decoded by utf8. Thus convert str to bytes, and use byte data
            field_delimiter = field_delimiter.encode("utf8")
            token_delimiter = token_delimiter.encode("utf8")
            start_mark = start_mark.encode("utf8")
            end_mark = end_mark.encode("utf8")
            unk_mark = unk_mark.encode("utf8")
        self._byte_data = byte_data
        self._src_vocab = self.load_dict(src_vocab_fpath, byte_data=byte_data)
        self._trg_vocab = self.load_dict(trg_vocab_fpath, byte_data=byte_data)
        self._bos_idx = self._src_vocab[start_mark]
        self._eos_idx = self._src_vocab[end_mark]
        self._unk_idx = self._src_vocab[unk_mark]
        self._field_delimiter = field_delimiter
        self._token_delimiter = token_delimiter
        self.load_src_trg_ids(fpattern, trg_fpattern)

    def load_src_trg_ids(self, fpattern, trg_fpattern=None):
        """
        Read every sample and fill ``_src_seq_ids``, ``_trg_seq_ids`` and
        ``_sample_infos``.

        Neither bos nor eos is added here (``add_beg`` and ``add_end`` are
        both False); the stored sequences contain only the converted tokens.
        """
        src_converter = Converter(
            vocab=self._src_vocab,
            beg=self._bos_idx,
            end=self._eos_idx,
            unk=self._unk_idx,
            delimiter=self._token_delimiter,
            add_beg=False,
            add_end=False)

        trg_converter = Converter(
            vocab=self._trg_vocab,
            beg=self._bos_idx,
            end=self._eos_idx,
            unk=self._unk_idx,
            delimiter=self._token_delimiter,
            add_beg=False,
            add_end=False)

        converters = ComposedConverter([src_converter, trg_converter])

        self._src_seq_ids = []
        self._trg_seq_ids = []
        self._sample_infos = []

        slots = [self._src_seq_ids, self._trg_seq_ids]
        for i, line in enumerate(self._load_lines(fpattern, trg_fpattern)):
            lens = []
            # zip stops at the shortest input, so a line with a single
            # field only fills the source slot.
            for field, slot in zip(converters(line), slots):
                slot.append(field)
                lens.append(len(field))
            self._sample_infos.append(SampleInfo(i, lens))

    def _load_lines(self, fpattern, trg_fpattern=None):
        """
        Yield the list of raw fields of each sample.

        Files are visited in sorted path order. With ``trg_fpattern`` the
        source and target files are paired by their sorted order and read
        line by line in lockstep.

        Raises:
            AssertionError: If no file matches ``fpattern``, or the number
                of source and target files differs.
        """
        fpaths = glob.glob(fpattern)
        fpaths = sorted(fpaths)  # TODO: Add custom sort
        assert len(fpaths) > 0, "no matching file to the provided data path"

        if self._byte_data:
            f_mode, f_encoding, endl = "rb", None, b"\n"
        else:
            f_mode, f_encoding, endl = "r", "utf8", "\n"

        if trg_fpattern is None:
            for fpath in fpaths:
                with io.open(fpath, f_mode, encoding=f_encoding) as f:
                    for line in f:
                        fields = line.strip(endl).split(self._field_delimiter)
                        yield fields
        else:
            # separated source and target language data files
            # assume we can get aligned data by sort the two language files
            # TODO: Need more rigorous check
            trg_fpaths = glob.glob(trg_fpattern)
            trg_fpaths = sorted(trg_fpaths)
            assert len(fpaths) == len(trg_fpaths), (
                "the number of source language data files must equal "
                "that of target language data files")

            for fpath, trg_fpath in zip(fpaths, trg_fpaths):
                with io.open(fpath, f_mode, encoding=f_encoding) as f:
                    with io.open(
                            trg_fpath, f_mode, encoding=f_encoding) as trg_f:
                        for line in zip(f, trg_f):
                            fields = [field.strip(endl) for field in line]
                            yield fields

    @staticmethod
    def load_dict(dict_path, reverse=False, byte_data=False):
        """
        Load a vocabulary file with one token per line.

        Args:
            dict_path: Path of the vocabulary file.
            reverse: Return ``{line index: token}`` instead of
                ``{token: line index}``.
            byte_data: Read in binary mode and keep tokens as ``bytes``.

        Returns:
            The mapping described above; line indices start at 0.
        """
        word_dict = {}
        if byte_data:
            f_mode, f_encoding, endl = "rb", None, b"\n"
        else:
            f_mode, f_encoding, endl = "r", "utf8", "\n"
        with io.open(dict_path, f_mode, encoding=f_encoding) as fdict:
            for idx, line in enumerate(fdict):
                if reverse:
                    word_dict[idx] = line.strip(endl)
                else:
                    word_dict[line.strip(endl)] = idx
        return word_dict

    def get_vocab_summary(self):
        """
        Return ``(src_vocab_size, trg_vocab_size, bos_idx, eos_idx,
        unk_idx)``.
        """
        return len(self._src_vocab), len(
            self._trg_vocab), self._bos_idx, self._eos_idx, self._unk_idx

    def __getitem__(self, idx):
        """
        Return ``(src_ids, trg_ids)`` for sample ``idx``, or only
        ``src_ids`` when no target sequences were loaded.
        """
        return (self._src_seq_ids[idx], self._trg_seq_ids[idx]
                ) if self._trg_seq_ids else self._src_seq_ids[idx]

    def __len__(self):
        """Number of loaded samples."""
        return len(self._sample_infos)


class Seq2SeqBatchSampler(BatchSampler):
    """
    Batch sampler for ``Seq2SeqDataset`` that yields lists of sample
    indices.

    Samples can be globally sorted, shuffled and/or sorted inside pools,
    are filtered by length, and are grouped either by sample count or by a
    padded-token budget. In distribute mode the batches are dealt out
    round-robin to the ranks reported by ``ParallelEnv``.
    """

    def __init__(self,
                 dataset,
                 batch_size,
                 pool_size=10000,
                 sort_type=SortType.NONE,
                 min_length=0,
                 max_length=100,
                 shuffle=False,
                 shuffle_batch=False,
                 use_token_batch=False,
                 clip_last_batch=False,
                 distribute_mode=True,
                 seed=0):
        """
        Args:
            dataset: Dataset exposing ``_sample_infos`` and ``__len__``.
            batch_size: Samples per batch, or the padded-token budget per
                batch when ``use_token_batch`` is True.
            pool_size: Number of samples sorted together for
                ``SortType.POOL``.
            sort_type: One of the ``SortType`` constants.
            min_length: Samples whose ``min_len`` is below this are dropped.
            max_length: Samples whose ``max_len`` is above this are dropped.
            shuffle: Shuffle the samples (ignored for ``SortType.GLOBAL``).
            shuffle_batch: Shuffle the order of the created batches.
            use_token_batch: Batch by token budget instead of sample count.
            clip_last_batch: Drop the trailing incomplete batch.
            distribute_mode: Give each rank only its share of the batches.
            seed: Seed for the random generator.
        """
        self._dataset = dataset
        self._batch_size = batch_size
        self._pool_size = pool_size
        self._sort_type = sort_type
        self._min_length = min_length
        self._max_length = max_length
        self._shuffle = shuffle
        self._shuffle_batch = shuffle_batch
        self._use_token_batch = use_token_batch
        self._clip_last_batch = clip_last_batch
        self._distribute_mode = distribute_mode
        self._seed = seed
        # NOTE: this is NumPy's module-level generator, so seeding it here
        # also reseeds every other user of ``np.random`` in the process.
        self._random = np.random
        self._random.seed(seed)
        # for multi-devices
        env = ParallelEnv()
        self._nranks = env.nranks
        self._local_rank = env.local_rank
        self._device_id = env.dev_id

    def __iter__(self):
        """
        Yield one list of dataset indices per batch for the current rank.

        An empty batch is never yielded. In distribute mode, when the number
        of batches is not a multiple of the number of ranks, the ranks that
        would come up one batch short repeat a previous batch so that every
        rank yields the same number of batches.
        """
        # global sort or global shuffle
        if self._sort_type == SortType.GLOBAL:
            infos = sorted(
                self._dataset._sample_infos, key=lambda x: x.max_len)
        else:
            # Work on a copy so that shuffling and pool sorting do not
            # reorder the list owned by the dataset.
            infos = list(self._dataset._sample_infos)
            if self._shuffle:
                self._random.shuffle(infos)

            if self._sort_type == SortType.POOL:
                reverse = True
                for i in range(0, len(infos), self._pool_size):
                    # to avoid placing short next to long sentences
                    reverse = not reverse
                    infos[i:i + self._pool_size] = sorted(
                        infos[i:i + self._pool_size],
                        key=lambda x: x.max_len,
                        reverse=reverse)

        batches = []
        if self._use_token_batch:
            batch_creator = TokenBatchCreator(self._batch_size)
        else:
            # One group holds the samples for all ranks; it is split below.
            batch_creator = SentenceBatchCreator(
                self._batch_size * self._nranks)
        batch_creator = MinMaxFilter(self._max_length, self._min_length,
                                     batch_creator)

        for info in infos:
            batch = batch_creator.append(info)
            # Skip both "nothing finished yet" (None) and empty batches.
            if batch:
                batches.append(batch)

        if not self._clip_last_batch and len(batch_creator.batch) != 0:
            batches.append(batch_creator.batch)

        if self._shuffle_batch:
            self._random.shuffle(batches)

        if not self._use_token_batch:
            # when producing batches according to sequence number, to confirm
            # neighbor batches which would be feed and run parallel have
            # similar length (thus similar computational cost) after shuffle,
            # we take them as a whole when shuffling and split here
            batches = [[
                batch[self._batch_size * i:self._batch_size * (i + 1)]
                for i in range(self._nranks)
            ] for batch in batches]
            # A trailing incomplete group does not fill every rank's slice;
            # drop the empty slices instead of yielding empty batches. The
            # padding at the end of this method evens out the ranks.
            batches = [
                sub_batch
                for sub_batch in itertools.chain.from_iterable(batches)
                if sub_batch
            ]

        # for multi-device
        batch_indices = None
        for batch_id, batch in enumerate(batches):
            if not self._distribute_mode or (
                    batch_id % self._nranks == self._local_rank):
                batch_indices = [info.i for info in batch]
                yield batch_indices

        remainder = len(batches) % self._nranks
        if (self._distribute_mode and remainder != 0 and
                self._local_rank >= remainder):
            # use previous data to pad
            if batch_indices is None:
                # This rank received no batch at all (fewer batches than
                # ranks); fall back to the last batch, which exists because
                # remainder != 0 implies batches is non-empty.
                batch_indices = [info.i for info in batches[-1]]
            yield batch_indices

    def __len__(self):
        """
        Number of batches per rank.

        For sentence batching this is the dataset size divided by
        ``batch_size * nranks``, rounded up; length filtering and
        ``clip_last_batch`` are not taken into account. For token batching
        the real number is not known in advance and 1 is returned.
        """
        if not self._use_token_batch:
            batch_number = (
                len(self._dataset) + self._batch_size * self._nranks - 1) // (
                    self._batch_size * self._nranks)
        else:
            # TODO(guosheng): fix the uncertain length
            batch_number = 1
        return batch_number