#coding=utf-8
"""
Convert the raw json data into training and validation examples.
"""
from collections import Counter
import json
import os
import io
import string

import click
import numpy as np
import ciseau

from vocab import Vocab
from evaluate import normalize_answer

# Constants
UNK = "<UNK>"
SOS = "<SOS>"
EOS = "<EOS>"
PAD = "<PAD>"

splits = ["train", "dev"]

ARTICLES = {"a", "an", "the", "of"}

# Keep the random embedding matrix the same between runs.
np.random.seed(1234)


def data_stream(path):
    """ Given a path json data in Pranav format, convert it to a stream
    question/context/answers tuple."""
    with io.open(path, "r") as handle:
        raw_data = json.load(handle)["data"]
    for ex in raw_data:
        for paragraph in ex["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                answers = qa["answers"]
                if "id" not in qa:
                    qa_id = -1
                else:
                    qa_id = qa["id"]
                yield question, context, answers, qa_id


def build_vocabulary(datadir, outdir, glove_path):
    """Construct the vocabulary object used throughout."""
    # We're not going to backprop through the word vectors,
    # so both train and dev words end up in the vocab.
    counter = Counter()
    for split in splits:
        datapath = os.path.join(datadir, split + ".json")

        for question, context, _, _ in data_stream(datapath):
            for word in ciseau.tokenize(question, normalize_ascii=False):
                counter[normalize(word)] += 1
            for word in ciseau.tokenize(context, normalize_ascii=False):
                counter[normalize(word)] += 1

    common_words = [UNK, SOS, EOS, PAD] + [w for w, _ in counter.most_common()]

    vocab_path = os.path.join(outdir, "vocab.txt")
    with io.open(vocab_path, "w", encoding="utf8") as handle:
        handle.write("\n".join(common_words))

    return Vocab(outdir)


def normalize_answer_tokens(tokens):
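    """Return (start, end) bounds that strip leading articles/punctuation
    and trailing punctuation from a tokenized answer."""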
    start = 0
    end = len(tokens)

    while end - start > 1:
        first_token = tokens[start].rstrip().lower()
        if first_token in string.punctuation or first_token in ARTICLES:
            start += 1
        else:
            break
    while end - start > 1:
        last_token = tokens[end - 1].rstrip().lower()
        if last_token in string.punctuation:
            end -= 1
        else:
            break
    return start, end


def tokenize_example(question, context, answers, strip_labels=True):
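    """Tokenize the question and context and locate the answer span.

    The answer is temporarily replaced by a placeholder so that its token
    position survives sentence tokenization. Returns (token_question,
    token_context, sentence_label, answer_start, answer_end).
    """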
    # Q: How should we choose the right answer? For now, take the first one.
    answer = answers[0]["text"]
    answer_start = answers[0]["answer_start"]

    if strip_labels:
        answer_tokens = ciseau.tokenize(answer, normalize_ascii=False)
        start_offset, end_offset = normalize_answer_tokens(answer_tokens)
        answer = "".join(answer_tokens[start_offset:end_offset])
        # shift the start offset past the stripped-off prefix:
        answer_start = answer_start + len("".join(answer_tokens[:start_offset]))

    # replace answer string with placeholder
    placeholder = "XXXX"
    new_context = context[:answer_start] + placeholder + context[answer_start +
                                                                 len(answer):]

    token_context = ciseau.sent_tokenize(new_context, keep_whitespace=True)
    token_question = ciseau.tokenize(question)

    sentence_label = None
    for sent_idx, sent in enumerate(token_context):
        answer_start = None
        for idx, word in enumerate(sent):
            if placeholder in word:
                answer_start = idx
                break

        if answer_start is None:
            continue

        sentence_label = sent_idx

        # deal with cases where the answer is in the middle
        # of the word
        answer = word.replace(placeholder, answer)
        token_answer = ciseau.tokenize(answer)

        answer_end = answer_start + len(token_answer) - 1
        answer_sent = sent[:answer_start] + token_answer + sent[answer_start +
                                                                1:]
        break

    token_context[sentence_label] = answer_sent

    return token_question, token_context, sentence_label, answer_start, answer_end


def normalize(word):
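    """Strip surrounding whitespace from a token."""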
    return word.strip()


def same_as_question_feature(question_idxs, context_idxs, vocab):
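    """Binary feature per context token: 1 if the lowercased token also
    appears in the question (ignoring articles and punctuation)."""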
    question_words = [vocab.idx_to_word(idx) for idx in question_idxs]

    # remove stop words and punctuation
    question_words = set([
        w.strip().lower() for w in question_words
        if w not in ARTICLES and w not in string.punctuation
    ])

    features = []
    for word_idx in context_idxs:
        word = vocab.idx_to_word(word_idx)
        features.append(int(word.strip().lower() in question_words))

    return features


def repeated_word_features(context_idxs, vocab):
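    """For each context token, return a binary repeated-word flag and a
    repetition intensity normalized by the count range in the context."""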
    context_words = [vocab.idx_to_word(idx) for idx in context_idxs]

    word_counter = {}
    for word in context_words:
        canon = word.strip().lower()
        if canon in word_counter:
            word_counter[canon] += 1
        else:
            word_counter[canon] = 1

    max_occur = max(word_counter.values())
    min_occur = min(word_counter.values())
    occur_range = max(1.0, max_occur - min_occur)

    repeated_words = []
    repeated_word_intensity = []

    for word in context_words:
        canon = word.strip().lower()
        count = word_counter[canon]
        repeated = float(count > 1 and canon not in ARTICLES and
                         canon not in string.punctuation)
        intensity = float((count - min_occur) / occur_range)

        repeated_words.append(repeated)
        repeated_word_intensity.append(intensity)

    return repeated_words, repeated_word_intensity


def convert_example_to_indices(example, outfile, vocab):
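    """Featurize a single (question, context, answers, qa_id) example and
    write the resulting feature dict to `outfile` as JSON."""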
    print("Processing {}".format(outfile))
    question, context, answers, qa_id = example

    tokenized = tokenize_example(question, context, answers, strip_labels=True)
    token_question, token_context, ans_sent, ans_start, ans_end = tokenized

    # Convert to indices
    question_idxs = [vocab.word_to_idx(normalize(w)) for w in token_question]

    # + 1 for end of sentence
    sent_lengths = [len(sent) + 1 for sent in token_context]
    context_idxs = []
    for sent in token_context:
        for w in sent:
            context_idxs.append(vocab.word_to_idx(normalize(w)))
        context_idxs.append(vocab.eos)

    same_as_question = same_as_question_feature(question_idxs, context_idxs,
                                                vocab)

    repeated_words, repeated_intensity = repeated_word_features(context_idxs,
                                                                vocab)

    features = {
        "question": question_idxs,
        "context": context_idxs,
        "ans_sentence": ans_sent,
        "ans_start": ans_start,
        "ans_end": ans_end,
        "sent_lengths": sent_lengths,
        "same_as_question_word": same_as_question,
        "repeated_words": repeated_words,
        "repeated_intensity": repeated_intensity,
        "qa_id": qa_id
    }

    # Hack!: This is not a great way to save indices...
    with io.open(outfile, "w", encoding="utf8") as handle:
        handle.write(unicode(json.dumps(features, ensure_ascii=False)))


def featurize_example(question, context, vocab):
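    """Featurize an unlabeled example for evaluation.

    Returns the feature tuple plus the tokenized context sentences.
    """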
    # Convert to indices
    question_idxs = [
        vocab.word_to_idx(normalize(w))
        for w in ciseau.tokenize(
            question, normalize_ascii=False)
    ]

    context_sents = ciseau.sent_tokenize(
        context, keep_whitespace=True, normalize_ascii=False)
    # + 1 for end of sentence
    sent_lengths = [len(sent) + 1 for sent in context_sents]
    context_idxs = []
    for sent in context_sents:
        for w in sent:
            context_idxs.append(vocab.word_to_idx(normalize(w)))
        context_idxs.append(vocab.eos)

    same_as_question = same_as_question_feature(question_idxs, context_idxs,
                                                vocab)
    repeated_words, repeated_intensity = repeated_word_features(context_idxs,
                                                                vocab)

    return (question_idxs, context_idxs, same_as_question, repeated_words,
            repeated_intensity, sent_lengths), context_sents


def random_sample(data, k, replace=False):
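    """Sample k elements from data, with or without replacement."""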
    indices = np.arange(len(data))
    chosen_indices = np.random.choice(indices, k, replace=replace)
    return [data[idx] for idx in chosen_indices]


@click.command()
@click.option("--datadir", type=str, help="Path to raw data")
@click.option("--outdir", type=str, help="Path to save the result")
@click.option("--glove-path", default="/mnt/data/jmiller/glove.840B.300d.txt")
def preprocess(datadir, outdir, glove_path):
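    """Build the vocabulary, embedding matrix, and per-example features."""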
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    print("Constructing vocabularies...")
    vocab = build_vocabulary(datadir, outdir, glove_path)
    print("Finished...")

    print("Building word embedding matrix...")
    vocab.construct_embedding_matrix(glove_path)
    print("Finished...")

    # Create training featurizations
    for split in splits:
        results_path = os.path.join(outdir, split)
        os.makedirs(results_path)

        # process each example
        examples = list(data_stream(os.path.join(datadir, split + ".json")))

        for idx, example in enumerate(examples):
            outfile = os.path.join(results_path, str(idx) + ".json")
            convert_example_to_indices(example, outfile, vocab)

    print("Building evaluation featurization...")
    eval_feats = []
    for question, context, _, qa_id in data_stream(
            os.path.join(datadir, "dev.json")):
        features, tokenized_context = featurize_example(question, context,
                                                        vocab)
        eval_feats.append((qa_id, tokenized_context, features))

    with io.open(
            os.path.join(outdir, "eval.json"), "w", encoding="utf8") as handle:
        handle.write(unicode(json.dumps(eval_feats, ensure_ascii=False)))


if __name__ == "__main__":
    preprocess()