# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import csv
import io
import os
from typing import List, Optional, Tuple, Union

import numpy as np
import paddle

from paddlehub.env import DATA_HOME
from paddlehub.text.bert_tokenizer import BertTokenizer
from paddlehub.text.tokenizer import CustomTokenizer
from paddlehub.utils.log import logger
from paddlehub.utils.utils import download
from paddlehub.utils.xarfile import is_xarfile, unarchive


class InputExample(object):
    """
    The input data structure of Transformer modules (BERT, ERNIE and so on).
    """

    def __init__(self, guid: int, text_a: str, text_b: Optional[str] = None, label: Optional[str] = None):
        """
        The input data structure.
        Args:
          guid (:obj:`int`):
              Unique id for the input data.
          text_a (:obj:`str`):
              The first sequence. For single-sequence tasks, only this sequence needs to be specified.
          text_b (:obj:`str`, `optional`, defaults to :obj:`None`):
              The second sequence if the example is a sentence pair.
          label (:obj:`str`, `optional`, defaults to :obj:`None`):
              The label of the example.
        Examples:
            .. code-block:: python
                from paddlehub.datasets.base_nlp_dataset import InputExample
                example = InputExample(guid=0,
                                text_a='15.4寸笔记本的键盘确实爽,基本跟台式机差不多了',
                                text_b='蛮喜欢数字小键盘,输数字特方便,样子也很美观,做工也相当不错',
                                label='1')
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label

    def __str__(self):
        if self.text_b is None:
            return "text={}\tlabel={}".format(self.text_a, self.label)
        else:
            return "text_a={}\ttext_b={},label={}".format(self.text_a, self.text_b, self.label)


class BaseNLPDataset(object):
    """
    The virtual base class for NLP datasets, such as TextClassificationDataset, SeqLabelingDataset, and so on.
    Subclasses must inherit from this class and override the method _read_file.
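
    A minimal subclassing sketch follows; MyTextDataset and the tab-separated
    "label<TAB>text" file layout are assumptions for illustration only.

    Examples:
        .. code-block:: python
            import csv
            import io

            from paddlehub.datasets.base_nlp_dataset import BaseNLPDataset, InputExample

            class MyTextDataset(BaseNLPDataset):
                def _read_file(self, input_file, is_file_with_header=False):
                    # Parse each tab-separated "label<TAB>text" line into an InputExample.
                    with io.open(input_file, 'r', encoding='utf8') as f:
                        reader = csv.reader(f, delimiter="\t", quotechar=None)
                        if is_file_with_header:
                            next(reader)
                        return [InputExample(guid=i, text_a=line[1], label=line[0])
                                for i, line in enumerate(reader)]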
    """

    def __init__(self,
                 base_path: str,
                 tokenizer: Union[BertTokenizer, CustomTokenizer],
                 max_seq_len: Optional[int] = 128,
                 mode: Optional[str] = "train",
                 data_file: Optional[str] = None,
                 label_file: Optional[str] = None,
                 label_list: Optional[List[str]] = None):
        """
        Args:
            base_path (:obj:`str`): The directory of the whole dataset.
            tokenizer (:obj:`BertTokenizer` or :obj:`CustomTokenizer`):
                It tokenizes the text and encodes the data as the model needs.
            max_seq_len (:obj:`int`, `optional`, defaults to :obj:`128`):
                If set to a number, it will limit the total sequence returned so that it has a maximum length.
            mode (:obj:`str`, `optional`, defaults to :obj:`train`):
                It identifies the dataset mode (train, test or dev).
            data_file (:obj:`str`, `optional`, defaults to :obj:`None`):
                The data file name, which is relative to the base_path.
            label_file (:obj:`str`, `optional`, defaults to :obj:`None`):
                The label file name, which is relative to the base_path.
                It contains all labels of the dataset, one label per line.
            label_list (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
                The list of all labels of the dataset.
        """
        self.data_file = os.path.join(base_path, data_file) if data_file else None
        self.label_list = label_list

        self.mode = mode
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len

        if label_file:
            self.label_file = os.path.join(base_path, label_file)
            if not self.label_list:
                self.label_list = self._load_label_data()
            else:
                logger.warning("As label_list has been assigned, label_file is noneffective")
        if self.label_list:
            self.label_map = {item: index for index, item in enumerate(self.label_list)}

    def _load_label_data(self):
        """
        Loads labels from label file.
        """
        if os.path.exists(self.label_file):
            with open(self.label_file, "r", encoding="utf8") as f:
                return f.read().strip().split("\n")
        else:
            raise RuntimeError("The file {} is not found.".format(self.label_file))

    def _download_and_uncompress_dataset(self, destination: str, url: str):
        """
        Downloads the dataset and uncompresses it.
        Args:
           destination (:obj:`str`): The directory where the dataset is cached.
           url (:obj:`str`): The URL to download the dataset from.
        """
        if not os.path.exists(destination):
            dataset_package = download(url=url, path=DATA_HOME)
            if is_xarfile(dataset_package):
                unarchive(dataset_package, DATA_HOME)
        else:
            logger.info("Dataset {} already cached.".format(destination))

    def _read_file(self, input_file: str, is_file_with_header: bool = False):
        """
        Reads the file.
        Args:
            input_file (:obj:`str`): The file to be read.
            is_file_with_header (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the file starts with a header line.
        """
        raise NotImplementedError

    def get_labels(self):
        """
        Gets all labels.
        """
        return self.label_list


class TextClassificationDataset(BaseNLPDataset, paddle.io.Dataset):
    """
    The dataset class suitable for all text classification datasets.
    """

    def __init__(self,
                 base_path: str,
                 tokenizer: Union[BertTokenizer, CustomTokenizer],
                 max_seq_len: int = 128,
                 mode: str = "train",
                 data_file: str = None,
                 label_file: str = None,
                 label_list: list = None,
                 is_file_with_header: bool = False):
        """
        Args:
            base_path (:obj:`str`): The directory of the whole dataset.
            tokenizer (:obj:`BertTokenizer` or :obj:`CustomTokenizer`):
                It tokenizes the text and encodes the data as the model needs.
            max_seq_len (:obj:`int`, `optional`, defaults to :obj:`128`):
                If set to a number, it will limit the total sequence returned so that it has a maximum length.
            mode (:obj:`str`, `optional`, defaults to :obj:`train`):
                It identifies the dataset mode (train, test or dev).
            data_file (:obj:`str`, `optional`, defaults to :obj:`None`):
                The data file name, which is relative to the base_path.
            label_file (:obj:`str`, `optional`, defaults to :obj:`None`):
                The label file name, which is relative to the base_path.
                It contains all labels of the dataset, one label per line.
            label_list (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
                The list of all labels of the dataset.
            is_file_with_header (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the file starts with a header line.
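
        A usage sketch follows; the vocab file path and the data layout are
        assumptions for illustration, not part of this API.

        Examples:
            .. code-block:: python
                from paddlehub.text.bert_tokenizer import BertTokenizer
                from paddlehub.datasets.base_nlp_dataset import TextClassificationDataset

                # Assumed: BertTokenizer is constructed from a vocabulary file path.
                tokenizer = BertTokenizer(vocab_file='/path/to/vocab.txt')
                train_dataset = TextClassificationDataset(
                    base_path='/path/to/dataset',
                    tokenizer=tokenizer,
                    max_seq_len=128,
                    mode='train',
                    data_file='train.tsv',
                    label_file='labels.txt',
                    is_file_with_header=True)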
        """
        super(TextClassificationDataset, self).__init__(
            base_path=base_path,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            mode=mode,
            data_file=data_file,
            label_file=label_file,
            label_list=label_list)
        self.examples = self._read_file(self.data_file, is_file_with_header)

        self.records = self._convert_examples_to_records(self.examples)

    def _read_file(self, input_file, is_file_with_header: bool = False) -> List[InputExample]:
        """
        Reads a tab-separated values (TSV) file.
        Args:
            input_file (:obj:`str`): The file to be read.
            is_file_with_header (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the file starts with a header line.
        Returns:
            examples (:obj:`List[InputExample]`): All the input data.
        """
        if not os.path.exists(input_file):
            raise RuntimeError("The file {} is not found.".format(input_file))
        else:
            with io.open(input_file, "r", encoding="UTF-8") as f:
                reader = csv.reader(f, delimiter="\t", quotechar=None)
                examples = []
                seq_id = 0
                header = next(reader) if is_file_with_header else None
                for line in reader:
                    example = InputExample(guid=seq_id, label=line[0], text_a=line[1])
                    seq_id += 1
                    examples.append(example)
                return examples

    def _convert_examples_to_records(self, examples: List[InputExample]) -> List[dict]:
        """
        Converts all examples to records which the model needs.
        Args:
            examples (:obj:`List[InputExample]`): All data examples returned by _read_file.
        Returns:
            records(:obj:`List[dict]`): All records which the model needs.
        """
        records = []
        for example in examples:
            record = self.tokenizer.encode(text=example.text_a, text_pair=example.text_b, max_seq_len=self.max_seq_len)
            # CustomTokenizer tokenizes the text first and then looks the words up in the vocab.
            # If none of the words are found in the vocab, the text is dropped.
            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization." % example.text_a)
                continue
            if example.label:
                record['label'] = self.label_map[example.label]
            records.append(record)
        return records

    def __getitem__(self, idx):
        record = self.records[idx]
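        # Records from a BertTokenizer hold wordpiece ids ('input_ids') and
        # segment ids, while records from a CustomTokenizer hold word ids
        # ('text') and the original sequence length ('seq_len').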
        if 'label' in record:
            if isinstance(self.tokenizer, BertTokenizer):
                return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'])
            elif isinstance(self.tokenizer, CustomTokenizer):
                return np.array(record['text']), np.array(record['seq_len']), np.array(record['label'])
        else:
            if isinstance(self.tokenizer, BertTokenizer):
                return np.array(record['input_ids']), np.array(record['segment_ids'])
            elif isinstance(self.tokenizer, CustomTokenizer):
                return np.array(record['text']), np.array(record['seq_len'])

    def __len__(self):
        return len(self.records)


class SeqLabelingDataset(BaseNLPDataset, paddle.io.Dataset):
    def __init__(self,
                 base_path: str,
                 tokenizer: Union[BertTokenizer, CustomTokenizer],
                 max_seq_len: int = 128,
                 mode: str = "train",
                 data_file: str = None,
                 label_file: str = None,
                 label_list: list = None,
                 split_char="\002",
                 no_entity_label="O",
                 is_file_with_header: bool = False):
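        """
        Args:
            base_path (:obj:`str`): The directory of the whole dataset.
            tokenizer (:obj:`BertTokenizer` or :obj:`CustomTokenizer`):
                It tokenizes the text and encodes the data as the model needs.
            max_seq_len (:obj:`int`, `optional`, defaults to :obj:`128`):
                If set to a number, it will limit the total sequence returned so that it has a maximum length.
            mode (:obj:`str`, `optional`, defaults to :obj:`train`):
                It identifies the dataset mode (train, test or dev).
            data_file (:obj:`str`, `optional`, defaults to :obj:`None`):
                The data file name, which is relative to the base_path.
            label_file (:obj:`str`, `optional`, defaults to :obj:`None`):
                The label file name, which is relative to the base_path.
                It contains all labels of the dataset, one label per line.
            label_list (:obj:`List[str]`, `optional`, defaults to :obj:`None`):
                The list of all labels of the dataset.
            split_char (:obj:`str`, `optional`, defaults to :obj:`"\\002"`):
                The separator between tokens (and between their labels) within one example.
            no_entity_label (:obj:`str`, `optional`, defaults to :obj:`"O"`):
                The label for tokens that do not belong to any entity.
            is_file_with_header (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether the file starts with a header line.
        """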
        super(SeqLabelingDataset, self).__init__(
            base_path=base_path,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            mode=mode,
            data_file=data_file,
            label_file=label_file,
            label_list=label_list)

        self.no_entity_label = no_entity_label
        self.split_char = split_char

        self.examples = self._read_file(self.data_file, is_file_with_header)
        self.records = self._convert_examples_to_records(self.examples)

    def _read_file(self, input_file, is_file_with_header: bool = False) -> List[InputExample]:
        """Reads a tab separated value file."""
        if not os.path.exists(input_file):
            raise RuntimeError("The file {} is not found.".format(input_file))
        else:
            with io.open(input_file, "r", encoding="UTF-8") as f:
                reader = csv.reader(f, delimiter="\t", quotechar=None)
                examples = []
                seq_id = 0
                header = next(reader) if is_file_with_header else None
                for line in reader:
                    example = InputExample(guid=seq_id, label=line[1], text_a=line[0])
                    seq_id += 1
                    examples.append(example)
                return examples

    def _convert_examples_to_records(self, examples: List[InputExample]) -> List[dict]:
        """
        Converts all examples to records which the model needs.
        Args:
            examples (:obj:`List[InputExample]`): All data examples returned by _read_file.
        Returns:
            records (:obj:`List[dict]`): All records which the model needs.
        """
        records = []
        for example in examples:
            tokens, labels = self._reseg_token_label(
                tokens=example.text_a.split(self.split_char),
                labels=example.label.split(self.split_char))
            record = self.tokenizer.encode(
                text=tokens, max_seq_len=self.max_seq_len)
            # CustomTokenizer tokenizes the text first and then looks the words up in the vocab.
            # If none of the words are found in the vocab, the text is dropped.
            if not record:
                logger.info(
                    "The text %s has been dropped as it has no words in the vocab after tokenization."
                    % example.text_a)
                continue
            if labels:
                record["label"] = []
                # Decode back to tokens (including special tokens such as [CLS] and [SEP])
                # so that the labels can be aligned with the encoded sequence.
                tokens_with_special_tokens = self.tokenizer.decode(record, only_convert_to_tokens=True)
                tokens_index = 0
                for token in tokens_with_special_tokens:
                    if tokens_index < len(tokens) and token == tokens[tokens_index]:
                        record["label"].append(self.label_list.index(labels[tokens_index]))
                        tokens_index += 1
                    else:
                        # Special tokens and unmatched positions get the no-entity label.
                        record["label"].append(self.label_list.index(self.no_entity_label))
            records.append(record)
        return records

    def _reseg_token_label(self, tokens: List[str],
                           labels: List[str] = None) -> Tuple[List[str], Optional[List[str]]]:
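        """
        Re-tokenizes each input token with the tokenizer and expands the labels to
        match: the first sub-token keeps the original label, and every following
        sub-token gets the "I-" variant of a "B-" label (or the label unchanged
        otherwise). Returns (tokens, labels), where labels is None if no labels
        were passed in.
        """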
        if labels:
            if len(tokens) != len(labels):
                raise ValueError("The length of tokens must be the same as the length of labels")
            ret_tokens = []
            ret_labels = []
            for token, label in zip(tokens, labels):
                sub_token = self.tokenizer.tokenize(token)
                if len(sub_token) == 0:
                    continue
                ret_tokens.extend(sub_token)
                ret_labels.append(label)
                if len(sub_token) < 2:
                    continue
                sub_label = label
                if label.startswith("B-"):
                    sub_label = "I-" + label[2:]
                ret_labels.extend([sub_label] * (len(sub_token) - 1))

            if len(ret_tokens) != len(ret_labels):
                raise ValueError("The length of ret_tokens does not match the length of ret_labels")
            return ret_tokens, ret_labels
        else:
            ret_tokens = []
            for token in tokens:
                sub_token = self.tokenizer.tokenize(token)
                if len(sub_token) == 0:
                    continue
                ret_tokens.extend(sub_token)
            return ret_tokens, None

    def __getitem__(self, idx):
        record = self.records[idx]
        if 'label' in record:
            if isinstance(self.tokenizer, BertTokenizer):
                return np.array(record['input_ids']), np.array(record['segment_ids']), np.array(record['label'])
            else:  # TODO(chenxiaojie): add CustomTokenizer supported
                raise NotImplementedError
        else:
            if isinstance(self.tokenizer, BertTokenizer):
                return np.array(record['input_ids']), np.array(record['segment_ids'])
            else:  # TODO(chenxiaojie): add CustomTokenizer supported
                raise NotImplementedError

    def __len__(self):
        return len(self.records)