# data.py

#   Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Dataset reader that converts the raw LAC corpus into id sequences, plus a
helper that decodes padded model predictions back into words and tags.
"""

import os

import paddle
import numpy as np

# "\002" separates the characters of a sentence and, likewise, the sequence
# labels of those characters. In train/test files both fields sit on the same
# line, split by a tab, and must contain the same number of items (so neither
# field ends with a trailing delimiter).
# For example: 除\002了\002他\002续\002任\002十\002二\002届\002政\002协\002委\002员
#              p-B\002p-I\002r-B\002v-B\002v-I\002m-B\002m-I\002m-I\002ORG-B\002ORG-I\002n-B\002n-I
CHAR_DELIMITER = "\002"


class LacDataset(paddle.io.Dataset):
    """Load one split of a LAC corpus and convert its texts to ids.

    The dataset directory must contain ``word.dic`` (word vocabulary),
    ``tag.dic`` (label vocabulary), ``q2b.dic`` (token replacement
    dictionary) and the ``<mode>.tsv`` file of the requested split.

    Args:
        base_path (str): The path of the dataset directory.
        mode (str, optional): The load mode, "train", "test" or "infer".
            Defaults to 'train', meaning load the train dataset.

    Raises:
        ValueError: If ``mode`` is not supported, or if a train/test line
            is malformed (see ``_read_file``).
    """

    def __init__(self, base_path, mode='train'):
        self.mode = mode
        self.base_path = base_path

        # All three dictionaries live next to the corpus files.
        self.word_vocab = self._load_vocab(
            os.path.join(self.base_path, 'word.dic'))
        self.label_vocab = self._load_vocab(
            os.path.join(self.base_path, 'tag.dic'))
        self.word_replace_dict = self._load_vocab(
            os.path.join(self.base_path, 'q2b.dic'))

        # Vocabulary size and number of labels.
        self.vocab_size = len(self.word_vocab)
        self.num_labels = len(self.label_vocab)

        if self.mode not in {"train", "test", "infer"}:
            raise ValueError(
                'Invalid mode: %s. Only support "train", "test" and "infer"' %
                self.mode)

        self.dataset_path = os.path.join(self.base_path, "%s.tsv" % self.mode)
        self._read_file()

    def __len__(self):
        """Return the number of samples that were read."""
        return self.total

    def __getitem__(self, index):
        """Return ``[word_ids, length]``, plus ``label_ids`` unless inferring."""
        word_ids = self.word_ids[index]
        sample = [word_ids, len(word_ids)]
        if self.mode != "infer":
            sample.append(self.label_ids[index])
        return sample

    def _read_file(self):
        """Read ``self.dataset_path`` into ``word_ids`` and ``label_ids``.

        In "infer" mode every line is one raw sentence that is split into
        single characters (blank lines yield empty samples, which keeps the
        samples aligned with the input lines). In the other modes the first
        line is skipped (presumably a header), blank lines are ignored, and
        every remaining line holds the tokens and the labels separated by a
        tab, each of them joined with ``CHAR_DELIMITER``.

        Raises:
            ValueError: If a train/test line does not consist of exactly two
                tab-separated fields, or if its numbers of tokens and labels
                differ.
        """
        self.word_ids = []
        self.label_ids = []
        self.total = 0
        is_infer = self.mode == "infer"

        with open(self.dataset_path, "r", encoding="utf-8") as fread:
            if not is_infer:
                # Skip the first line; the default keeps an empty file from
                # raising StopIteration.
                next(fread, None)

            for line in fread:
                line = line.strip()
                if is_infer:
                    words = list(line)
                else:
                    if not line:
                        # A blank line (e.g. at the end of the file) carries
                        # no sample.
                        continue
                    words, labels = line.split("\t")
                    words = words.split(CHAR_DELIMITER)

                tmp_word_ids = self._convert_tokens_to_ids(
                    words,
                    self.word_vocab,
                    oov_replace="OOV",
                    token_replace=self.word_replace_dict)

                if not is_infer:
                    tmp_label_ids = self._convert_tokens_to_ids(
                        labels.split(CHAR_DELIMITER),
                        self.label_vocab,
                        oov_replace="O")
                    # Raised explicitly (not asserted) so the check survives
                    # running Python with -O.
                    if len(tmp_word_ids) != len(tmp_label_ids):
                        raise ValueError(
                            "The word ids %s is not match with the label ids %s"
                            % (tmp_word_ids, tmp_label_ids))
                    self.label_ids.append(tmp_label_ids)

                self.word_ids.append(tmp_word_ids)
                self.total += 1

    def _load_vocab(self, dict_path):
        """Load a tab-separated dictionary file into a dict.

        Supported line layouts:
            * one column: the term is mapped to its 0-based line index;
            * two columns: ``key<TAB>value``, or ``value<TAB>key`` when the
              first column of the first two-column line consists of digits.
              Values of two-column files are kept as the strings that were
              read.

        Args:
            dict_path (str): The path of the dictionary file.

        Returns:
            dict: The loaded mapping. A key that occurs more than once keeps
            the value of its last occurrence.

        Raises:
            ValueError: If a line has more than two columns.
        """
        vocab = {}
        reverse = None  # decided by the first two-column line
        with open(dict_path, "r", encoding='utf8') as fin:
            for i, line in enumerate(fin):
                terms = line.strip("\n").split("\t")
                if len(terms) == 2:
                    if reverse is None:
                        reverse = terms[0].isdigit()
                    if reverse:
                        value, key = terms
                    else:
                        key, value = terms
                elif len(terms) == 1:
                    key, value = terms[0], i
                else:
                    raise ValueError("Error line: %s in file: %s" %
                                     (line, dict_path))
                vocab[key] = value
        return vocab

    def _convert_tokens_to_ids(self,
                               tokens,
                               vocab,
                               oov_replace=None,
                               token_replace=None):
        """Convert tokens to their ids in ``vocab``.

        Args:
            tokens (list): The tokens to convert.
            vocab (dict): Mapping from token to id.
            oov_replace (str, optional): Token whose id stands in for tokens
                missing from ``vocab``. Without it (or when it is itself
                missing from ``vocab``) such tokens map to ``None``.
            token_replace (dict, optional): Replacements applied to the
                tokens before they are looked up.

        Returns:
            list: One id per token.
        """
        oov_id = vocab.get(oov_replace) if oov_replace else None
        if token_replace:
            tokens = [token_replace.get(token, token) for token in tokens]
        return [vocab.get(token, oov_id) for token in tokens]


def parse_lac_result(words, preds, lengths, word_vocab, label_vocab):
    """Decode padded id sequences into words and their tags.

    Only the first ``lengths[i]`` positions of sentence ``i`` are used.
    Tokens are merged into words: a new word starts at the first token, at
    every tag ending in "-B", and at an "O" tag that follows a non-"O" tag;
    any other token is appended to the current word. The tag of a word is
    the part of its first token's label before the first "-".

    Args:
        words: Batch of token-id sequences, indexable by sentence.
        preds: Batch of predicted label-id sequences, aligned with ``words``.
        lengths: Number of valid positions of every sentence.
        word_vocab (dict): Mapping from token to token id.
        label_vocab (dict): Mapping from label to label id.

    Returns:
        list: One ``[sent_out, tags_out]`` pair per sentence, where
        ``sent_out`` holds the merged words and ``tags_out`` the tag of each
        word; both lists always have the same length.
    """
    id2word_dict = {idx: word for word, idx in word_vocab.items()}
    id2label_dict = {idx: label for label, idx in label_vocab.items()}

    batch_out = []
    # Index-based access keeps working for any batch container that supports
    # len() and indexing.
    for sent_index in range(len(lengths)):
        length = lengths[sent_index]
        sent = [id2word_dict[index] for index in words[sent_index][:length]]
        tags = [id2label_dict[index] for index in preds[sent_index][:length]]

        sent_out = []
        tags_out = []
        partial_word = ""
        for ind, tag in enumerate(tags):
            # The sentence start is detected by position, not by an empty
            # buffer: an empty-string token must not look like a new start.
            is_word_start = ind == 0 or tag.endswith("-B") or (
                tag == "O" and tags[ind - 1] != "O")
            if not is_word_start:
                partial_word += sent[ind]
                continue

            if ind > 0:
                # Flush the word that ends right before this token.
                sent_out.append(partial_word)
            tags_out.append(tag.split('-')[0])
            partial_word = sent[ind]

        # Flush the last word; nothing is pending for an empty sentence.
        if tags:
            sent_out.append(partial_word)

        batch_out.append([sent_out, tags_out])
    return batch_out