#coding:utf-8
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Run BERT on SQuAD 1.1 and SQuAD 2.0."""

import json
import os

from paddlehub.reader import tokenization
from paddlehub.common.dir import DATA_HOME
from paddlehub.common.logger import logger
from paddlehub.dataset.base_nlp_dataset import MRCDataset

_DATA_URL = "https://bj.bcebos.com/paddlehub-dataset/squad.tar.gz"


class SquadExample(object):
    """A single training/test example for the SQuAD dataset.

    For examples without an answer, the start and end position are -1.
    """

    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None,
                 is_impossible=False):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = ""
        s += "qas_id: %s" % (tokenization.printable_text(self.qas_id))
        s += ", question_text: %s" % (tokenization.printable_text(
            self.question_text))
        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
        # Guard on "is not None" so that position 0 and unanswerable (-1)
        # examples are still printed; the original repeated the
        # start_position check for all three fields.
        if self.start_position is not None:
            s += ", start_position: %d" % (self.start_position)
        if self.end_position is not None:
            s += ", end_position: %d" % (self.end_position)
        s += ", is_impossible: %r" % (self.is_impossible)
        return s


class SQUAD(MRCDataset):
    """SQuAD 1.1 / 2.0 machine reading comprehension dataset."""

    def __init__(
            self,
            version_2_with_negative=False,
            tokenizer=None,
            max_seq_len=None,
            max_query_len=64,
            doc_stride=128,
    ):
        self.version_2_with_negative = version_2_with_negative
        if not version_2_with_negative:
            train_file = "train-v1.1.json"
            dev_file = "dev-v1.1.json"
        else:
            train_file = "train-v2.0.json"
            dev_file = "dev-v2.0.json"
        dataset_dir = os.path.join(DATA_HOME, "squad_data")
        base_path = self._download_dataset(dataset_dir, url=_DATA_URL)
        super(SQUAD, self).__init__(
            base_path=base_path,
            train_file=train_file,
            dev_file=dev_file,
            test_file=None,
            label_file=None,
            label_list=None,
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            max_query_len=max_query_len,
            doc_stride=doc_stride,
        )

    def _read_file(self, input_file, phase=None):
        """Read a SQuAD json file into a list of SquadExample."""
        with open(input_file, "r") as reader:
            input_data = json.load(reader)["data"]

        def is_whitespace(c):
            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
                    c) == 0x202F:
                return True
            return False

        examples = []
        for entry in input_data:
            for paragraph in entry["paragraphs"]:
                paragraph_text = paragraph["context"]
                # Split the raw context into whitespace-delimited tokens and
                # record, for every character, the index of the token it
                # belongs to.
                doc_tokens = []
                char_to_word_offset = []
                prev_is_whitespace = True
                for c in paragraph_text:
                    if is_whitespace(c):
                        prev_is_whitespace = True
                    else:
                        if prev_is_whitespace:
                            doc_tokens.append(c)
                        else:
                            doc_tokens[-1] += c
                        prev_is_whitespace = False
                    char_to_word_offset.append(len(doc_tokens) - 1)

                for qa in paragraph["qas"]:
                    qas_id = qa["id"]
                    question_text = qa["question"]
                    start_position = None
                    end_position = None
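                    # For train/dev, the block below maps the answer's
                    # character offset in the raw context to word-level
                    # start/end positions via char_to_word_offset.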
                    orig_answer_text = None
                    is_impossible = False
                    if phase in ["train", "dev"]:
                        if self.version_2_with_negative:
                            is_impossible = qa["is_impossible"]
                        if phase == "train" and (len(
                                qa["answers"]) != 1) and (not is_impossible):
                            print(qa)
                            raise ValueError(
                                "For training, each question should have exactly 1 answer."
                            )
                        if not is_impossible:
                            answer = qa["answers"][0]
                            orig_answer_text = answer["text"]
                            answer_offset = answer["answer_start"]
                            answer_length = len(orig_answer_text)
                            start_position = char_to_word_offset[answer_offset]
                            end_position = char_to_word_offset[
                                answer_offset + answer_length - 1]
                            # Only add answers where the text can be exactly
                            # recovered from the document. If this CAN'T
                            # happen it's likely due to weird Unicode stuff,
                            # so we will just skip the example.
                            #
                            # Note that this means that for training mode, not
                            # every example is guaranteed to be preserved.
                            actual_text = " ".join(
                                doc_tokens[start_position:(end_position + 1)])
                            cleaned_answer_text = " ".join(
                                tokenization.whitespace_tokenize(
                                    orig_answer_text))
                            if actual_text.find(cleaned_answer_text) == -1:
                                logger.warning(
                                    "Could not find answer: '%s' vs. '%s'" %
                                    (actual_text, cleaned_answer_text))
                                continue
                        else:
                            start_position = -1
                            end_position = -1
                            orig_answer_text = ""

                    example = SquadExample(
                        qas_id=qas_id,
                        question_text=question_text,
                        doc_tokens=doc_tokens,
                        orig_answer_text=orig_answer_text,
                        start_position=start_position,
                        end_position=end_position,
                        is_impossible=is_impossible)
                    examples.append(example)
        return examples


if __name__ == "__main__":
    from paddlehub.tokenizer.bert_tokenizer import BertTokenizer

    tokenizer = BertTokenizer(vocab_file='vocab.txt')
    ds = SQUAD(
        version_2_with_negative=True, tokenizer=tokenizer, max_seq_len=512)
    print("first 2 dev examples")
    for e in ds.get_dev_examples()[:2]:
        print(e)
    print("first 2 train examples")
    for e in ds.get_train_examples()[:2]:
        print(e)
    # test_file is None for SQuAD, so this is expected to yield no examples.
    print("first 2 test examples")
    for e in ds.get_test_examples()[:2]:
        print(e)
    print(ds)
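

# ---------------------------------------------------------------------------
# Illustrative sketch (editor-added, not part of the PaddleHub API): a
# minimal, self-contained demonstration of the char-to-word offset mapping
# used in SQUAD._read_file above, showing how a character-level answer_start
# from the SQuAD json becomes word-level start/end positions. Call it
# manually to verify the mapping; nothing in this module invokes it.
def _demo_char_to_word_offset():
    context = "BERT was released in 2018 ."
    doc_tokens = []
    char_to_word_offset = []
    prev_is_whitespace = True
    for c in context:
        if c in " \t\r\n":
            prev_is_whitespace = True
        else:
            if prev_is_whitespace:
                doc_tokens.append(c)
            else:
                doc_tokens[-1] += c
            prev_is_whitespace = False
        char_to_word_offset.append(len(doc_tokens) - 1)
    # The answer "2018" starts at character offset 21 and spans 4 characters.
    answer_offset, answer_length = 21, 4
    start_position = char_to_word_offset[answer_offset]
    end_position = char_to_word_offset[answer_offset + answer_length - 1]
    # Both offsets fall inside the fifth whitespace token, "2018".
    assert doc_tokens[start_position:end_position + 1] == ["2018"]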