# cls.py

"""this file is adapted from https://github.com/zihangdai/xlnet"""

import csv
import io
import logging
import os
import types

import numpy as np
import sentencepiece as spm

from classifier_utils import PaddingInputExample
from classifier_utils import convert_single_example
from prepro_utils import preprocess_text, encode_ids


class DataProcessor(object):
    """Base class for data converters for sequence classification data sets.

    Subclasses provide the `get_*_examples` / `get_labels` methods; this class
    supplies tokenization, feature conversion and batching on top of them.
    """

    def __init__(self, args):
        """Sets up tokenization and progress bookkeeping.

        Args:
          args: object exposing `data_dir`, `max_seq_length`, `uncased`,
            `random_seed` and `spiece_model_file` attributes.
        """
        self.data_dir = args.data_dir
        self.max_seq_length = args.max_seq_length
        self.uncased = args.uncased
        # Seeds the global numpy RNG, which `data_generator` uses to shuffle.
        np.random.seed(args.random_seed)

        sp = spm.SentencePieceProcessor()
        sp.Load(args.spiece_model_file)

        def tokenize_fn(text):
            text = preprocess_text(text, lower=self.uncased)
            return encode_ids(sp, text)

        self.tokenize_fn = tokenize_fn

        # -1 means "not known yet"; the values are filled in by
        # `data_generator` and its readers.
        self.current_train_example = -1
        self.num_examples = {'train': -1, 'dev': -1, 'test': -1}
        self.current_train_epoch = -1

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_test_examples(self, data_dir):
        """Gets a collection of `InputExample`s for prediction."""
        raise NotImplementedError()

    def get_labels(self):
        """Gets the list of labels for this data set."""
        raise NotImplementedError()

    def convert_example(self, index, example, labels, max_seq_length,
                        tokenize_fn):
        """Converts a single `InputExample` into a single `InputFeatures`.

        Thin wrapper that forwards all arguments to `convert_single_example`.
        """
        return convert_single_example(index, example, labels, max_seq_length,
                                      tokenize_fn)

    def generate_instance(self, feature):
        """
        generate instance with given feature

        Args:
            feature: InputFeatures(object). A single set of features of data.

        Returns:
            list: `[input_ids, segment_ids, input_pos, label_id]`, where
            `input_pos` holds the 0-based position of every entry of
            `input_ids`.
        """
        # `input_pos` used to be referenced without ever being defined, which
        # raised NameError on every call.
        input_pos = list(range(len(feature.input_ids)))
        return [
            feature.input_ids, feature.segment_ids, input_pos, feature.label_id
        ]

    def prepare_batch_data(self, batch_data, is_regression):
        """Generate numpy tensors

        Args:
          batch_data: list of instances, each laid out as
            `[input_ids, input_mask, segment_ids, label_id, is_real_example]`.
          is_regression: bool. When true labels are cast to float32, otherwise
            to int64.

        Returns:
          list: `[input_ids, input_mask, segment_ids, labels, is_real_example]`
          as numpy arrays; `input_ids` and `labels` get an extra trailing axis
          of size 1.
        """
        input_ids = np.expand_dims(
            np.array([inst[0] for inst in batch_data]).astype('int64'), axis=-1)
        input_mask = np.array(
            [inst[1] for inst in batch_data]).astype('float32')
        segment_ids = np.array([inst[2] for inst in batch_data]).astype('int64')
        labels = np.expand_dims(
            np.array([inst[3] for inst in batch_data]).astype(
                'float32' if is_regression else 'int64'),
            axis=-1)
        is_real_example = np.array(
            [inst[4] for inst in batch_data]).astype('int64')

        return [input_ids, input_mask, segment_ids, labels, is_real_example]

    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file.

        Args:
          input_file: path of a UTF-8 encoded TSV file.
          quotechar: forwarded to `csv.reader`.

        Returns:
          list of rows (each a list of strings); empty rows are skipped.
        """
        with io.open(input_file, "r", encoding="utf8") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return [line for line in reader if line]

    def get_num_examples(self, phase):
        """Get number of examples for train, dev or test.

        Returns -1 for a phase whose count has not been recorded yet.

        Raises:
          ValueError: if `phase` is not one of 'train', 'dev', 'test'.
        """
        if phase not in ['train', 'dev', 'test']:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")
        return self.num_examples[phase]

    def get_train_progress(self):
        """Gets progress for training phase.

        Returns:
          tuple: `(current_train_example, current_train_epoch)`.
        """
        return self.current_train_example, self.current_train_epoch

    def data_generator(self,
                       batch_size,
                       is_regression,
                       phase='train',
                       epoch=1,
                       dev_count=1,
                       shuffle=True):
        """
        Generate data for train, dev or test.

        Args:
          batch_size: int. The batch size of generated data.
          is_regression: bool. If true no label list is used for conversion
            and labels are emitted as float32.
          phase: string. The phase for which to generate data.
          epoch: int. Total epoches to generate data.
          dev_count: int. Batches are released in groups of exactly this many
            (presumably one per device -- confirm with the caller); a trailing
            group with fewer batches is dropped.
          shuffle: bool. Whether to shuffle examples.

        Returns:
          A zero-argument callable that, when called, returns a generator over
          batches as produced by `prepare_batch_data`.

        Raises:
          ValueError: if `phase` is not one of 'train', 'dev', 'test'.
        """
        if phase == 'train':
            examples = self.get_train_examples(self.data_dir)
            self.num_examples['train'] = len(examples)
        elif phase == 'dev':
            examples = self.get_dev_examples(self.data_dir)
            self.num_examples['dev'] = len(examples)
        elif phase == 'test':
            examples = self.get_test_examples(self.data_dir)
            self.num_examples['test'] = len(examples)
        else:
            raise ValueError(
                "Unknown phase, which should be in ['train', 'dev', 'test'].")

        def instance_reader():
            label_list = self.get_labels() if not is_regression else None
            for epoch_index in range(epoch):
                if shuffle:
                    # In-place shuffle, re-done at the start of every epoch.
                    np.random.shuffle(examples)
                if phase == 'train':
                    self.current_train_epoch = epoch_index
                for (index, example) in enumerate(examples):
                    if phase == 'train':
                        self.current_train_example = index + 1
                    feature = convert_single_example(index, example, label_list,
                                                     self.max_seq_length,
                                                     self.tokenize_fn)
                    instance = [
                        feature.input_ids, feature.input_mask,
                        feature.segment_ids, feature.label_id,
                        feature.is_real_example
                    ]
                    yield instance

        def batch_reader(reader, batch_size):
            batch = []
            for instance in reader():
                if len(batch) < batch_size:
                    batch.append(instance)
                else:
                    yield batch
                    batch = [instance]

            # Emit the final, possibly smaller, batch.
            if len(batch) > 0:
                yield batch

        def wrapper():
            all_dev_batches = []
            for batch_data in batch_reader(instance_reader, batch_size):
                batch_data = self.prepare_batch_data(batch_data, is_regression)
                if len(all_dev_batches) < dev_count:
                    all_dev_batches.append(batch_data)

                # Only release batches once a full group of `dev_count` is
                # available; an incomplete trailing group is never yielded.
                if len(all_dev_batches) == dev_count:
                    for batch in all_dev_batches:
                        yield batch
                    all_dev_batches = []

        return wrapper


class InputExample(object):
    """Plain record holding one raw example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Stores the given fields unchanged on the instance.

        Args:
          guid: Unique id for the example.
          text_a: string. Untokenized text of the first sequence; the only
            text needed for single sequence tasks.
          text_b: (Optional) string. Untokenized text of the second sequence,
            needed only for sequence pair tasks.
          label: (Optional) string. Label of the example; expected for train
            and dev examples, not for test examples.
        """
        self.guid, self.label = guid, label
        self.text_a, self.text_b = text_a, text_b


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Trims two token lists in place until their combined length fits.

    Args:
      tokens_a: list of tokens for the first sequence (mutated).
      tokens_b: list of tokens for the second sequence (mutated).
      max_length: largest allowed value of `len(tokens_a) + len(tokens_b)`.
    """
    # Heuristic: remove one trailing token at a time from whichever list is
    # currently longer (ties go to tokens_b). Compared with cutting the same
    # fraction from both, this protects a short sequence, where every token
    # likely carries more information than a token of a long one.
    while len(tokens_a) + len(tokens_b) > max_length:
        victim = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        victim.pop()


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self,
                 input_ids,
                 input_mask,
                 segment_ids,
                 label_id,
                 is_real_example=True):
        """Stores the converted features of one example.

        Args:
          input_ids: token ids of the example.
          input_mask: mask values aligned with `input_ids`.
          segment_ids: segment ids aligned with `input_ids`.
          label_id: label of the example.
          is_real_example: (Optional) flag stored alongside the features.
            Defaults to True so existing four-argument callers keep working.
        """
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        # The batching code in this file reads `feature.is_real_example`, so
        # the container must expose it.
        self.is_real_example = is_real_example


class GLUEProcessor(DataProcessor):
    """Processor for tasks stored as `train.tsv` / `dev.tsv` / `test.tsv`.

    The column indices default to None here and are expected to be set by
    subclasses before examples are created.
    """

    def __init__(self, args):
        super(GLUEProcessor, self).__init__(args)
        self.train_file = "train.tsv"
        self.dev_file = "dev.tsv"
        self.test_file = "test.tsv"
        self.label_column = None
        self.text_a_column = None
        self.text_b_column = None
        self.contains_header = True
        # Test files may use a different layout; None means "same as train"
        # and is resolved in `get_test_examples`.
        self.test_text_a_column = None
        self.test_text_b_column = None
        self.test_contains_header = True

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, self.train_file)), "train")

    def get_dev_examples(self, data_dir):
        """See base class."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, self.dev_file)), "dev")

    def get_test_examples(self, data_dir):
        """See base class."""
        if self.test_text_a_column is None:
            self.test_text_a_column = self.text_a_column
        if self.test_text_b_column is None:
            self.test_text_b_column = self.text_b_column

        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, self.test_file)), "test")

    def get_labels(self):
        """See base class."""
        return ["0", "1"]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets.

        Args:
          lines: list of rows, each a list of column strings.
          set_type: "train", "dev" or "test"; "test" selects the test column
            layout and assigns the first label as a placeholder.

        Returns:
          list of `InputExample`; rows too short to hold a required column
          are skipped with a warning.
        """
        logger = logging.getLogger(__name__)

        # These depend only on `set_type`, so resolve them once up front.
        is_test = set_type == "test"
        if is_test:
            skip_header = self.test_contains_header
            a_column = self.test_text_a_column
            b_column = self.test_text_b_column
        else:
            skip_header = self.contains_header
            a_column = self.text_a_column
            b_column = self.text_b_column

        examples = []
        for (i, line) in enumerate(lines):
            if i == 0 and skip_header:
                continue
            guid = "%s-%s" % (set_type, i)

            # there are some incomplete lines in QNLI
            # (`tf.logging` was used here although `tf` is never imported, so
            # such lines crashed with NameError instead of being skipped.)
            if len(line) <= a_column:
                logger.warning('Incomplete line, ignored.')
                continue
            text_a = line[a_column]

            if b_column is not None:
                if len(line) <= b_column:
                    logger.warning('Incomplete line, ignored.')
                    continue
                text_b = line[b_column]
            else:
                text_b = None

            if is_test:
                label = self.get_labels()[0]
            else:
                if len(line) <= self.label_column:
                    logger.warning('Incomplete line, ignored.')
                    continue
                label = line[self.label_column]
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))
        return examples


class Yelp5Processor(DataProcessor):
    """Processor reading five-label data from `train.csv` / `test.csv`."""

    def __init__(self, args):
        super(Yelp5Processor, self).__init__(args)

    def get_train_examples(self, data_dir):
        """See base class."""
        return self._create_examples(os.path.join(data_dir, "train.csv"))

    def get_dev_examples(self, data_dir):
        """See base class. Note that the dev set is read from `test.csv`."""
        return self._create_examples(os.path.join(data_dir, "test.csv"))

    def get_labels(self):
        """See base class."""
        return ["1", "2", "3", "4", "5"]

    def _create_examples(self, input_file):
        """Creates examples for the training and dev sets.

        Args:
          input_file: path of a CSV file whose first column is the label and
            whose second column is the text.

        Returns:
          list of `InputExample` with `guid` set to the row index.
        """
        examples = []
        # `tf.gfile.Open` was used here although `tf` is never imported;
        # read the file with `io.open` like the rest of this module does.
        # newline="" lets the csv module handle line endings itself.
        with io.open(input_file, "r", encoding="utf8", newline="") as f:
            reader = csv.reader(f)
            for i, line in enumerate(reader):

                label = line[0]
                # Collapse doubled and backslash-escaped quotes.
                text_a = line[1].replace('""', '"').replace('\\"', '"')
                examples.append(
                    InputExample(
                        guid=str(i), text_a=text_a, text_b=None, label=label))
        return examples


class ImdbProcessor(DataProcessor):
    """Processor reading one text file per example from `neg`/`pos` folders."""

    def __init__(self, args):
        super(ImdbProcessor, self).__init__(args)

    def get_labels(self):
        """See base class."""
        return ["neg", "pos"]

    def get_train_examples(self, data_dir):
        """See base class. Reads from the `train` sub-directory."""
        train_root = os.path.join(data_dir, "train")
        return self._create_examples(train_root)

    def get_dev_examples(self, data_dir):
        """See base class. Reads from the `test` sub-directory."""
        dev_root = os.path.join(data_dir, "test")
        return self._create_examples(dev_root)

    def _create_examples(self, data_dir):
        """Builds one example per `*txt` file under `data_dir/neg` and `/pos`.

        The folder name is used as the label and every example gets the
        placeholder guid "unused_id".
        """
        collected = []
        for label in ["neg", "pos"]:
            label_dir = os.path.join(data_dir, label)
            for entry in os.listdir(label_dir):
                if entry.endswith("txt"):
                    full_path = os.path.join(label_dir, entry)
                    with io.open(full_path, 'r', encoding='utf8') as handle:
                        raw = handle.read()
                    # Trim surrounding whitespace, then turn HTML line breaks
                    # into spaces.
                    text = raw.strip().replace("<br />", " ")
                    collected.append(
                        InputExample(
                            guid="unused_id",
                            text_a=text,
                            text_b=None,
                            label=label))
        return collected


class MnliMatchedProcessor(GLUEProcessor):
    """GLUE processor configured for the MNLI matched dev/test files."""

    def __init__(self, args):
        super(MnliMatchedProcessor, self).__init__(args)
        # Column layout: texts in columns 8 and 9, label in the last column.
        self.text_a_column, self.text_b_column = 8, 9
        self.label_column = -1
        # Only the dev and test file names are overridden.
        self.dev_file, self.test_file = "dev_matched.tsv", "test_matched.tsv"

    def get_labels(self):
        """See base class."""
        labels = ["contradiction", "entailment", "neutral"]
        return labels


class MnliMismatchedProcessor(MnliMatchedProcessor):
    """Same as `MnliMatchedProcessor` but reads the mismatched dev/test files."""

    def __init__(self, args):
        super(MnliMismatchedProcessor, self).__init__(args)
        self.dev_file, self.test_file = ("dev_mismatched.tsv",
                                         "test_mismatched.tsv")


class StsbProcessor(GLUEProcessor):
    """GLUE processor whose labels are floats rather than class names."""

    def __init__(self, args):
        super(StsbProcessor, self).__init__(args)
        self.label_column = 9
        self.text_a_column = 7
        self.text_b_column = 8

    def get_labels(self):
        """Returns `[0.0]`; its first entry is the placeholder test label."""
        return [0.0]

    def _create_examples(self, lines, set_type):
        """Creates examples for the training, dev and test sets.

        Args:
          lines: list of rows, each a list of column strings.
          set_type: "train", "dev" or "test"; "test" selects the test column
            layout and assigns the placeholder label from `get_labels`.

        Returns:
          list of `InputExample` whose `label` is a float; rows too short to
          hold a required column are skipped with a warning.
        """
        logger = logging.getLogger(__name__)

        # These depend only on `set_type`, so resolve them once up front.
        is_test = set_type == "test"
        if is_test:
            skip_header = self.test_contains_header
            a_column = self.test_text_a_column
            b_column = self.test_text_b_column
        else:
            skip_header = self.contains_header
            a_column = self.text_a_column
            b_column = self.text_b_column

        examples = []
        for (i, line) in enumerate(lines):
            if i == 0 and skip_header:
                continue
            guid = "%s-%s" % (set_type, i)

            # Skip rows that are too short to contain the needed columns.
            # (`tf.logging` was used here although `tf` is never imported, so
            # such lines crashed with NameError instead of being skipped.)
            if len(line) <= a_column:
                logger.warning('Incomplete line, ignored.')
                continue
            text_a = line[a_column]

            if b_column is not None:
                if len(line) <= b_column:
                    logger.warning('Incomplete line, ignored.')
                    continue
                text_b = line[b_column]
            else:
                text_b = None

            if is_test:
                label = self.get_labels()[0]
            else:
                if len(line) <= self.label_column:
                    logger.warning('Incomplete line, ignored.')
                    continue
                label = float(line[self.label_column])
            examples.append(
                InputExample(
                    guid=guid, text_a=text_a, text_b=text_b, label=label))

        return examples


# Running this module directly performs no work.
if __name__ == '__main__':
    pass