data.py

#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities that convert a raw text corpus into model inputs (token and label ids).
"""

import os

import paddle
import numpy as np

# "\002" separates the individual characters of a sentence and, in parallel,
# the individual sequence labels (the two columns themselves are separated by
# a tab when the dataset file is read). For example:
#              除\002了\002他\002续\002任\002十\002二\002届\002政\002协\002委\002员
#              p-B\002p-I\002r-B\002v-B\002v-I\002m-B\002m-I\002m-I\002ORG-B\002ORG-I\002n-B\002n-I\002
CHAR_DELIMITER = "\002"


def load_kv_dict(dict_path,
                 delimiter="\t",
                 key_func=None,
                 value_func=None,
                 reverse=False):
    """
    Load a key-value dict from a two-column text file.

    Each line is split on ``delimiter``; lines that do not yield exactly two
    fields are skipped.

    Args:
        dict_path (str): Path of the UTF-8 encoded dictionary file.
        delimiter (str): Separator between the two columns. Defaults to tab.
        key_func (callable, optional): Applied to every key before it is
            stored.
        value_func (callable, optional): Applied to every value before it is
            stored.
        reverse (bool): If True, the first column is the value and the second
            column is the key.

    Returns:
        dict: Mapping from (converted) keys to (converted) values.

    Raises:
        KeyError: If the same (converted) key appears more than once.
    """
    vocab = {}
    # The context manager guarantees the file handle is released even when a
    # duplicated key aborts the load.
    with open(dict_path, "r", encoding='utf8') as dict_file:
        for line in dict_file:
            terms = line.strip("\n").split(delimiter)
            if len(terms) != 2:
                continue
            if reverse:
                value, key = terms
            else:
                key, value = terms
            if key_func:
                key = key_func(key)
            # Check duplicates on the converted key: it is the converted key
            # that is stored, so comparing the raw text would miss collisions
            # whenever key_func changes the key.
            if key in vocab:
                raise KeyError("key duplicated with [%s]" % (key))
            if value_func:
                value = value_func(value)
            vocab[key] = value
    return vocab


def convert_tokens_to_ids(tokens, vocab, oov_replace=None, token_replace=None):
    """
    Map every token to its id in ``vocab``.

    Args:
        tokens (iterable): Tokens to convert.
        vocab (dict): Mapping from token to id.
        oov_replace (optional): Key in ``vocab`` whose id is used for tokens
            missing from ``vocab``. When falsy (or itself missing from
            ``vocab``), unknown tokens map to ``None``.
        token_replace (dict, optional): Mapping applied to each token before
            the vocabulary lookup; tokens without an entry are kept as-is.

    Returns:
        list: One id (or ``None``) per input token, in order.
    """
    fallback_id = None
    if oov_replace:
        fallback_id = vocab.get(oov_replace)

    if token_replace:
        # Normalise tokens lazily before they are looked up.
        tokens = (token_replace.get(tok, tok) for tok in tokens)

    return [vocab.get(tok, fallback_id) for tok in tokens]


def batch_padding_fn(max_seq_len):
    """
    Build a batch collate function that truncates/pads samples.

    Args:
        max_seq_len (int): Upper bound on the padded sequence length.

    Returns:
        callable: A function taking a batch (a sequence of samples, each
        ``[word_ids]`` or ``[word_ids, label_ids]``) and returning
        ``(word_ids, lengths)`` or, when any sample carries labels,
        ``(word_ids, lengths, label_ids)``. Sequences are cut to the
        shorter of ``max_seq_len`` and the longest sample in the batch, and
        padded with 0 up to that length. Lengths are ``np.int64`` values
        measured after truncation.
    """

    def pad_batch_to_max_seq_len(batch):
        longest = max(len(sample[0]) for sample in batch)
        target_len = min(longest, max_seq_len)

        padded_words, padded_labels, seq_lens = [], [], []
        for sample in batch:
            word_ids = sample[0][:target_len]
            num_words = len(word_ids)
            # Labels are padded by the same amount as the words.
            padding = [0] * (target_len - num_words)

            padded_words.append(word_ids + padding)
            if len(sample) == 2:
                padded_labels.append(sample[1][:target_len] + padding)
            seq_lens.append(np.int64(num_words))

        if not padded_labels:
            return padded_words, seq_lens
        return padded_words, seq_lens, padded_labels

    return pad_batch_to_max_seq_len


def parse_lac_result(words, preds, lengths, word_vocab, label_vocab):
    """
    Convert padded id sequences back into segmented words and their tags.

    Args:
        words: Per-sentence sequences of word ids.
        preds: Per-sentence sequences of predicted label ids.
        lengths: Per-sentence lengths; only the first ``length - 1``
            positions of each sentence are decoded.
        word_vocab (dict): Mapping from word to id.
        label_vocab (dict): Mapping from label to id.

    Returns:
        list: One ``[words, tags]`` pair per sentence, where ``words`` are
        the merged character spans and ``tags`` the label prefix (the part
        before the first ``-``) of each span.
    """
    id_to_word = {idx: word for word, idx in word_vocab.items()}
    id_to_label = {idx: label for label, idx in label_vocab.items()}

    results = []
    for row in range(len(lengths)):
        end = lengths[row] - 1
        chars = [id_to_word[idx] for idx in words[row][:end]]
        labels = [id_to_label[idx] for idx in preds[row][:end]]

        segments = []
        segment_tags = []
        current = ""
        for pos, label in enumerate(labels):
            if current == "":
                # Nothing accumulated yet: this position opens the first word.
                current = chars[pos]
                segment_tags.append(label.split('-')[0])
            elif label.endswith("-B") or (label == "O" and
                                          labels[pos - 1] != "O"):
                # A new word begins: flush the accumulated one.
                segments.append(current)
                segment_tags.append(label.split('-')[0])
                current = chars[pos]
            else:
                # Continuation of the current word.
                current += chars[pos]

        # Flush the trailing word; skipped when no tag was produced at all.
        if len(segments) < len(segment_tags):
            segments.append(current)

        results.append([segments, segment_tags])
    return results


class LacDataset(paddle.io.Dataset):
    """Load a dataset file and convert all of its texts to ids.

    The file ``<base_path>/<mode>.tsv`` is read eagerly on construction.
    For "train" and "test" the first line is skipped as a header and every
    following line holds two tab-separated columns (characters and labels,
    each joined by ``CHAR_DELIMITER``). For "infer" every line is treated as
    raw text and split into single characters.

    Args:
        base_path (str): The path of the dataset directory.
        word_vocab (dict): Mapping from word to integer id. Words missing
            from it are looked up under the "OOV" key.
        label_vocab (dict): Mapping from label to integer id. Labels missing
            from it are looked up under the "O" key.
        word_replace_dict (dict): Mapping from a word to its replacement,
            applied before the vocabulary lookup. May be empty or None.
        mode (str, optional): The load mode, "train", "test" or "infer".
            Defaults to 'train', meaning load the train dataset.

    Raises:
        ValueError: If ``mode`` is not supported, or if a line yields a
            different number of word ids and label ids.
    """

    def __init__(self,
                 base_path,
                 word_vocab,
                 label_vocab,
                 word_replace_dict,
                 mode='train'):
        super().__init__()
        self.mode = mode
        self.base_path = base_path
        self.word_vocab = word_vocab
        self.label_vocab = label_vocab
        self.word_replace_dict = word_replace_dict

        # Calculate vocab size and labels number, note: vocab value starts from 0.
        self.vocab_size = max(self.word_vocab.values()) + 1
        self.num_labels = max(self.label_vocab.values()) + 1

        if self.mode not in {"train", "test", "infer"}:
            raise ValueError(
                'Invalid mode: %s. Only support "train", "test" and "infer"' %
                self.mode)

        self.dataset_path = os.path.join(self.base_path, "%s.tsv" % self.mode)
        self._read_file()

    def __len__(self):
        """Return the number of samples read from the dataset file."""
        return self.total

    def __getitem__(self, index):
        """Return ``[word_ids]`` in infer mode, else ``[word_ids, label_ids]``."""
        if self.mode == "infer":
            return [self.word_ids[index]]
        return [self.word_ids[index], self.label_ids[index]]

    def _read_file(self):
        """Read ``self.dataset_path`` and fill word_ids, label_ids and total."""
        self.word_ids = []
        self.label_ids = []
        self.total = 0
        is_infer = self.mode == "infer"

        with open(self.dataset_path, "r", encoding="utf-8") as fread:
            if not is_infer:
                # Skip the header line; the default keeps an empty file from
                # raising StopIteration and simply yields an empty dataset.
                next(fread, None)

            for line in fread:
                line = line.strip()
                if is_infer:
                    words = list(line)
                else:
                    words, labels = line.split("\t")
                    words = words.split(CHAR_DELIMITER)

                tmp_word_ids = convert_tokens_to_ids(
                    words,
                    self.word_vocab,
                    oov_replace="OOV",
                    token_replace=self.word_replace_dict)

                if not is_infer:
                    tmp_label_ids = convert_tokens_to_ids(
                        labels.split(CHAR_DELIMITER),
                        self.label_vocab,
                        oov_replace="O")
                    # Raise explicitly rather than assert so the check still
                    # runs when Python is started with -O.
                    if len(tmp_word_ids) != len(tmp_label_ids):
                        raise ValueError(
                            "The word ids %s is not match with the label ids %s"
                            % (tmp_word_ids, tmp_label_ids))
                    self.label_ids.append(tmp_label_ids)

                self.word_ids.append(tmp_word_ids)
                self.total += 1