diff --git a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/args.py b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/args.py
index feb08370ece45832c8f74aeb273e9e6023c152ef..8f823b1a6bc90fdc056dc045961a2a687fec64b4 100644
--- a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/args.py
+++ b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/args.py
@@ -3,6 +3,12 @@ import argparse
 
 def parse_args():
     parser = argparse.ArgumentParser(description=__doc__)
+    parser.add_argument(
+        "--task_name",
+        default=None,
+        type=str,
+        required=True,
+        help="The name of the task.")
     parser.add_argument(
         "--data_path",
         type=str,
diff --git a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py
index d1d2bf2a637358636fc6bee6375631e9004e3e0c..feaed689391bd1de76f9c7acb477379f61bdd30c 100644
--- a/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py
+++ b/PaddleNLP/examples/machine_reading_comprehension/DuReader-robust/run_du.py
@@ -26,10 +26,13 @@ from args import parse_args
 import json
 
 import paddlenlp as ppnlp
+from paddlenlp.datasets import SQuAD, DuReaderRobust, CMRC, DRCD
 from paddlenlp.data import Pad, Stack, Tuple
 from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
 from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
 
+TASK_CLASSES = {"dureader-robust": DuReaderRobust, "cmrc": CMRC, "drcd": DRCD}
+
 MODEL_CLASSES = {
     "bert": (BertForQuestionAnswering, BertTokenizer),
     "ernie": (ErnieForQuestionAnswering, ErnieTokenizer)
@@ -89,18 +92,20 @@
                     start_logits=start_logits,
                     end_logits=end_logits))
 
-    all_predictions, _, scores_diff_json = compute_predictions(
+    all_predictions, _, _ = compute_predictions(
         data_loader.dataset.examples, data_loader.dataset.features,
         all_results, args.n_best_size, args.max_answer_length,
         args.do_lower_case, False,
-        0.0, args.verbose, tokenizer)
+        0.0, args.verbose, tokenizer, False)
 
     if do_pred:
-        with open('prediction.json', "w") as writer:
+        with open('prediction.json', "w", encoding='utf-8') as writer:
             writer.write(
                 json.dumps(
                     all_predictions, ensure_ascii=False, indent=4) + "\n")
     else:
-        squad_evaluate(data_loader.dataset.examples, all_predictions,
-                       scores_diff_json, 1.0)
+        squad_evaluate(
+            examples=data_loader.dataset.examples,
+            preds=all_predictions,
+            is_whitespace_splited=False)
 
     model.train()
@@ -110,13 +115,16 @@ def do_train(args):
     if paddle.distributed.get_world_size() > 1:
         paddle.distributed.init_parallel_env()
 
+    task_name = args.task_name.lower()
+    dataset_class = TASK_CLASSES[task_name]
+
     args.model_type = args.model_type.lower()
     model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
     root = args.data_path
     set_seed(args)
 
-    train_ds = ppnlp.datasets.DuReaderRobust(
+    train_ds = dataset_class(
         tokenizer=tokenizer,
         root=root,
         doc_stride=args.doc_stride,
@@ -141,7 +149,7 @@
         collate_fn=train_batchify_fn,
         return_list=True)
 
-    dev_ds = ppnlp.datasets.DuReaderRobust(
+    dev_ds = dataset_class(
         tokenizer=tokenizer,
         root=root,
         doc_stride=args.doc_stride,
@@ -164,23 +172,6 @@
         collate_fn=dev_batchify_fn,
         return_list=True)
 
-    test_ds = ppnlp.datasets.DuReaderRobust(
-        tokenizer=tokenizer,
-        root=root,
-        doc_stride=args.doc_stride,
-        max_query_length=args.max_query_length,
-        max_seq_length=args.max_seq_length,
-        mode='test')
-
-    test_batch_sampler = paddle.io.BatchSampler(
-        test_ds, batch_size=args.batch_size, shuffle=False)
-
-    test_data_loader = DataLoader(
-        dataset=test_ds,
-        batch_sampler=test_batch_sampler,
-        collate_fn=dev_batchify_fn,
-        return_list=True)
-
     model = model_class.from_pretrained(args.model_name_or_path)
 
     if paddle.distributed.get_world_size() > 1:
@@ -245,9 +236,6 @@ def do_train(args):
     if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
         evaluate(model, dev_data_loader, args, tokenizer)
 
-    if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
-        evaluate(model, test_data_loader, args, tokenizer, True)
-
 
 if __name__ == "__main__":
     args = parse_args()
diff --git a/PaddleNLP/paddlenlp/datasets/dureader.py b/PaddleNLP/paddlenlp/datasets/dureader.py
index 81f8e3484cf2e076e5a2cc07168695065dbc2cda..60569d78143e6fd7ff1151683016983f78939e93 100644
--- a/PaddleNLP/paddlenlp/datasets/dureader.py
+++ b/PaddleNLP/paddlenlp/datasets/dureader.py
@@ -10,7 +10,7 @@ from paddlenlp.utils.env import DATA_HOME
 from paddle.io import Dataset
 from .squad import InputFeatures, SQuAD
 
-__all__ = ['DuReader', 'DuReaderYesNo', 'DuReaderRobust']
+__all__ = ['DuReader', 'DuReaderYesNo']
 
 
 class DuReaderExample(object):
@@ -168,63 +168,6 @@
         self.examples = examples
 
 
-class DuReaderRobust(SQuAD):
-    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
-
-    DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz'
-
-    SPLITS = {
-        'train': META_INFO(
-            os.path.join('dureader_robust-data', 'train.json'),
-            '800a3dcb742f9fdf9b11e0a83433d4be'),
-        'dev': META_INFO(
-            os.path.join('dureader_robust-data', 'dev.json'),
-            'ae73cec081eaa28a735204c4898a2222'),
-        'test': META_INFO(
-            os.path.join('dureader_robust-data', 'test.json'),
-            'e0e8aa5c7b6d11b6fc3935e29fc7746f')
-    }
-
-    def __init__(self,
-                 tokenizer,
-                 mode='train',
-                 version_2_with_negative=True,
-                 root=None,
-                 doc_stride=128,
-                 max_query_length=64,
-                 max_seq_length=512,
-                 **kwargs):
-
-        super(DuReaderRobust, self).__init__(
-            tokenizer=tokenizer,
-            mode=mode,
-            version_2_with_negative=False,
-            root=root,
-            doc_stride=doc_stride,
-            max_query_length=max_query_length,
-            max_seq_length=max_seq_length,
-            **kwargs)
-
-    def _get_data(self, root, mode, **kwargs):
-        default_root = os.path.join(DATA_HOME, 'DuReader')
-
-        filename, data_hash = self.SPLITS[mode]
-
-        fullname = os.path.join(default_root,
-                                filename) if root is None else os.path.join(
-                                    os.path.expanduser(root), filename)
-        if not os.path.exists(fullname) or (data_hash and
-                                            not md5file(fullname) == data_hash):
-            if root is not None:  # not specified, and no need to warn
-                warnings.warn(
-                    'md5 check failed for {}, download {} data to {}'.format(
-                        filename, self.__class__.__name__, default_root))
-
-            get_path_from_url(self.DATA_URL, default_root)
-
-        self.full_path = fullname
-
-
 class DuReaderYesNo(Dataset):
     META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
diff --git a/PaddleNLP/paddlenlp/datasets/squad.py b/PaddleNLP/paddlenlp/datasets/squad.py
index 48231a3f7a88ad42a8d4a607cf10e2e46b65cd24..0b390a6c1356e21f58d6ae3df8c4a18dc2de2903 100644
--- a/PaddleNLP/paddlenlp/datasets/squad.py
+++ b/PaddleNLP/paddlenlp/datasets/squad.py
@@ -8,8 +8,9 @@ from paddle.dataset.common import md5file
 from paddle.utils.download import get_path_from_url
 from paddle.io import Dataset
 from paddlenlp.utils.env import DATA_HOME
+from paddlenlp.transformers.tokenizer_utils import _is_whitespace, _is_control, convert_to_unicode
 
-__all__ = ['SQuAD']
+__all__ = ['SQuAD', 'DuReaderRobust', 'CMRC', 'DRCD']
 
 
 class SquadExample(object):
@@ -96,7 +97,7 @@ class SQuAD(Dataset):
     def __init__(self,
                  tokenizer,
                  mode='train',
-                 version_2_with_negative=True,
+                 version_2_with_negative=False,
                  root=None,
                  doc_stride=128,
                  max_query_length=64,
@@ -127,7 +128,7 @@
             max_seq_length=self.max_seq_length)
 
     def _get_data(self, root, mode, **kwargs):
-        default_root = os.path.join(DATA_HOME, 'SQuAD')
+        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
         if self.version_2_with_negative:
             filename, data_hash = self.SPLITS['2.0'][mode]
         else:
@@ -166,7 +167,6 @@
         features = []
         for (example_index, example) in enumerate(examples):
             query_tokens = tokenizer._tokenize(example.question_text)
-
             if len(query_tokens) > max_query_length:
                 query_tokens = query_tokens[0:max_query_length]
@@ -285,7 +285,6 @@
             unique_id += 1
             features.append(feature)
-
         return features
 
     def _improve_answer_span(self, doc_tokens, input_start, input_end,
@@ -365,12 +364,6 @@
         with open(self.full_path, "r", encoding="utf8") as reader:
             input_data = json.load(reader)["data"]
 
-        def is_whitespace(c):
-            if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
-                    c) == 0x202F:
-                return True
-            return False
-
         examples = []
         for entry in input_data:
             for paragraph in entry["paragraphs"]:
@@ -379,7 +372,7 @@
                 char_to_word_offset = []
                 prev_is_whitespace = True
                 for c in paragraph_text:
-                    if is_whitespace(c):
+                    if _is_whitespace(c):
                         prev_is_whitespace = True
                     else:
                         if prev_is_whitespace:
@@ -410,8 +403,11 @@
                         answer_offset = answer["answer_start"]
                         answer_length = len(orig_answer_text)
                         start_position = char_to_word_offset[answer_offset]
-                        end_position = char_to_word_offset[
-                            answer_offset + answer_length - 1]
+                        try:
+                            end_position = char_to_word_offset[
+                                answer_offset + answer_length - 1]
+                        except:
+                            continue
 
                     else:
                         start_position = -1
@@ -428,7 +424,6 @@
                 else:
                     start_position = -1
                     end_position = -1
-
                 example = SquadExample(
                     qas_id=qas_id,
                     question_text=question_text,
@@ -439,7 +434,7 @@
                     is_impossible=is_impossible)
                 examples.append(example)
 
-        self.examples = examples
+        self.examples = examples[:1000]
 
     def __len__(self):
         return len(self.features)
@@ -451,3 +446,212 @@
             return feature.input_ids, feature.segment_ids, feature.unique_id, feature.start_position, feature.end_position
         else:
             return feature.input_ids, feature.segment_ids, feature.unique_id
+
+
+class DuReaderRobust(SQuAD):
+    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
+
+    DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz'
+
+    SPLITS = {
+        'train': META_INFO(
+            os.path.join('dureader_robust-data', 'train.json'),
+            '800a3dcb742f9fdf9b11e0a83433d4be'),
+        'dev': META_INFO(
+            os.path.join('dureader_robust-data', 'dev.json'),
+            'ae73cec081eaa28a735204c4898a2222'),
+        'test': META_INFO(
+            os.path.join('dureader_robust-data', 'test.json'),
+            'e0e8aa5c7b6d11b6fc3935e29fc7746f')
+    }
+
+    def __init__(self,
+                 tokenizer,
+                 mode='train',
+                 root=None,
+                 doc_stride=128,
+                 max_query_length=64,
+                 max_seq_length=512,
+                 **kwargs):
+
+        super(DuReaderRobust, self).__init__(
+            tokenizer=tokenizer,
+            mode=mode,
+            version_2_with_negative=False,
+            root=root,
+            doc_stride=doc_stride,
+            max_query_length=max_query_length,
+            max_seq_length=max_seq_length,
+            **kwargs)
+
+    def _get_data(self, root, mode, **kwargs):
+        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
+
+        filename, data_hash = self.SPLITS[mode]
+
+        fullname = os.path.join(default_root,
+                                filename) if root is None else os.path.join(
+                                    os.path.expanduser(root), filename)
+        if not os.path.exists(fullname) or (data_hash and
+                                            not md5file(fullname) == data_hash):
+            if root is not None:  # not specified, and no need to warn
+                warnings.warn(
+                    'md5 check failed for {}, download {} data to {}'.format(
+                        filename, self.__class__.__name__, default_root))
+
+            get_path_from_url(self.DATA_URL, default_root)
+
+        self.full_path = fullname
+
+    def _read(self):
+        with open(self.full_path, "r", encoding="utf8") as reader:
+            input_data = json.load(reader)["data"]
+
+        examples = []
+        for entry in input_data:
+            for paragraph in entry["paragraphs"]:
+                paragraph_text = paragraph["context"]
+                raw_doc_tokens = self.tokenizer.basic_tokenizer.tokenize(
+                    paragraph_text)
+                doc_tokens = []
+                char_to_word_offset = []
+                k = 0
+                temp_word = ""
+                for c in paragraph_text:
+                    if not self.tokenizer.basic_tokenizer.tokenize(c):
+                        char_to_word_offset.append(k - 1)
+                        continue
+                    else:
+                        temp_word += c
+                        char_to_word_offset.append(k)
+
+                    if temp_word == raw_doc_tokens[k]:
+                        doc_tokens.append(temp_word)
+                        temp_word = ""
+                        k += 1
+
+                assert k == len(raw_doc_tokens)
+
+                for qa in paragraph["qas"]:
+                    qas_id = qa["id"]
+                    question_text = qa["question"]
+                    start_position = None
+                    end_position = None
+                    orig_answer_text = None
+                    is_impossible = False
+
+                    if self.is_training:
+                        if (len(qa["answers"]) != 1):
+                            raise ValueError(
+                                "For training, each question should have exactly 1 answer."
+                            )
+
+                        answer = qa["answers"][0]
+                        orig_answer_text = answer["text"]
+                        answer_offset = answer["answer_start"]
+                        answer_length = len(orig_answer_text)
+                        start_position = char_to_word_offset[answer_offset]
+                        try:
+                            end_position = char_to_word_offset[
+                                answer_offset + answer_length - 1]
+                        except:
+                            continue
+
+                    else:
+                        orig_answer_text = []
+                        if 'answers' in qa.keys():
+                            answers = qa["answers"]
+                            for answer in answers:
+                                orig_answer_text.append(answer["text"])
+
+                    example = SquadExample(
+                        qas_id=qas_id,
+                        question_text=question_text,
+                        doc_tokens=doc_tokens,
+                        orig_answer_text=orig_answer_text,
+                        start_position=start_position,
+                        end_position=end_position,
+                        is_impossible=is_impossible)
+                    examples.append(example)
+
+        self.examples = examples
+
+
+class CMRC(DuReaderRobust):
+    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
+
+    DEV_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_dev.json'
+    TRAIN_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_train.json'
+    TRIAL_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_trial.json'
+
+    SPLITS = {
+        'train': META_INFO(
+            os.path.join('cmrc2018_train.json'),
+            '7fb714b479c7f40fbb16acabd7af0ede'),
+        'dev': META_INFO(
+            os.path.join('cmrc2018_dev.json'),
+            '853b80709ff2d071f9fce196521b843c'),
+        'trial': META_INFO(
+            os.path.join('cmrc2018_trial.json'),
+            '853b80709ff2d071f9fce196521b843c')
+    }
+
+    def _get_data(self, root, mode, **kwargs):
+        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
+
+        filename, data_hash = self.SPLITS[mode]
+        fullname = os.path.join(default_root,
+                                filename) if root is None else os.path.join(
+                                    os.path.expanduser(root), filename)
+        if not os.path.exists(fullname) or (data_hash and
+                                            not md5file(fullname) == data_hash):
+            if root is not None:  # not specified, and no need to warn
+                warnings.warn(
+                    'md5 check failed for {}, download {} data to {}'.format(
+                        filename, self.__class__.__name__, default_root))
+            if mode == 'train':
+                fullname = get_path_from_url(self.TRAIN_DATA_URL, default_root)
+            elif mode == 'dev':
+                fullname = get_path_from_url(self.DEV_DATA_URL, default_root)
+            elif mode == 'trial':
+                fullname = get_path_from_url(self.TRIAL_DATA_URL, default_root)
+        self.full_path = fullname
+
+
+class DRCD(DuReaderRobust):
+    META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
+
+    DEV_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_dev.json'
+    TRAIN_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_training.json'
+    TEST_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_test.json'
+
+    SPLITS = {
+        'train': META_INFO(
+            os.path.join('DRCD_training.json'),
+            '7fb714b479c7f40fbb16acabd7af0ede'),
+        'dev': META_INFO(
+            os.path.join('DRCD_dev.json'), '853b80709ff2d071f9fce196521b843c'),
+        'test': META_INFO(
+            os.path.join('DRCD_test.json'), '853b80709ff2d071f9fce196521b843c')
+    }
+
+    def _get_data(self, root, mode, **kwargs):
+        default_root = os.path.join(DATA_HOME, self.__class__.__name__)
+
+        filename, data_hash = self.SPLITS[mode]
+        fullname = os.path.join(default_root,
+                                filename) if root is None else os.path.join(
+                                    os.path.expanduser(root), filename)
+        if not os.path.exists(fullname) or (data_hash and
+                                            not md5file(fullname) == data_hash):
+            if root is not None:  # not specified, and no need to warn
+                warnings.warn(
+                    'md5 check failed for {}, download {} data to {}'.format(
+                        filename, self.__class__.__name__, default_root))
+            if mode == 'train':
+                fullname = get_path_from_url(self.TRAIN_DATA_URL, default_root)
+            elif mode == 'dev':
+                fullname = get_path_from_url(self.DEV_DATA_URL, default_root)
+            elif mode == 'test':
+                fullname = get_path_from_url(self.TEST_DATA_URL, default_root)
+        self.full_path = fullname
diff --git a/PaddleNLP/paddlenlp/metrics/squad.py b/PaddleNLP/paddlenlp/metrics/squad.py
index 44aa7158f423349f4755be0c857b3dda629c6208..23b55b82ef4e42899a2a8df755045c0fbf0f0b4e 100644
--- a/PaddleNLP/paddlenlp/metrics/squad.py
+++ b/PaddleNLP/paddlenlp/metrics/squad.py
@@ -14,10 +14,17 @@ import os
 import math
 
 
-def compute_predictions(all_examples, all_features, all_results, n_best_size,
-                        max_answer_length, do_lower_case,
-                        version_2_with_negative, null_score_diff_threshold,
-                        verbose, tokenizer):
+def compute_predictions(all_examples,
+                        all_features,
+                        all_results,
+                        n_best_size,
+                        max_answer_length,
+                        do_lower_case,
+                        version_2_with_negative,
+                        null_score_diff_threshold,
+                        verbose,
+                        tokenizer,
+                        is_whitespace_splited=True):
     """Write final predictions to the json file and log-odds of null if needed."""
 
     example_index_to_features = collections.defaultdict(list)
@@ -130,6 +137,8 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
                 orig_text = " ".join(orig_tokens)
 
                 final_text = get_final_text(tok_text, orig_text, tokenizer,
                                             verbose)
+                if not is_whitespace_splited:
+                    final_text = final_text.replace(' ', '')
                 if final_text in seen_predictions:
                     continue
@@ -184,7 +193,6 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
             nbest_json.append(output)
 
         assert len(nbest_json) >= 1
-
         if not version_2_with_negative:
             all_predictions[example.qas_id] = nbest_json[0]["text"]
         else:
@@ -246,7 +254,6 @@ def get_final_text(pred_text, orig_text, tokenizer, verbose):
     # and `pred_text`, and check if they are the same length. If they are
     # NOT the same length, the heuristic has failed. If they are the same
     # length, we assume the characters are one-to-one aligned.
-
     tok_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_text))
     start_position = tok_text.find(pred_text)
     if start_position == -1:
@@ -356,21 +363,24 @@ def normalize_answer(s):
     def lower(text):
         return text.lower()
 
-    return white_space_fix(remove_articles(remove_punc(lower(s))))
-
-
-def get_tokens(s):
-    if not s: return []
-    return normalize_answer(s).split()
+    if not s:
+        return []
+    else:
+        return white_space_fix(remove_articles(remove_punc(lower(s))))
 
 
 def compute_exact(a_gold, a_pred):
     return int(normalize_answer(a_gold) == normalize_answer(a_pred))
 
 
-def compute_f1(a_gold, a_pred):
-    gold_toks = get_tokens(a_gold)
-    pred_toks = get_tokens(a_pred)
+def compute_f1(a_gold, a_pred, is_whitespace_splited=True):
+    gold_toks = normalize_answer(a_gold).split()
+    pred_toks = normalize_answer(a_pred).split()
+
+    if not is_whitespace_splited:
+        gold_toks = gold_toks[0]
+        pred_toks = pred_toks[0]
+
     common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
     num_same = sum(common.values())
     if len(gold_toks) == 0 or len(pred_toks) == 0:
@@ -384,7 +394,7 @@
     return f1
 
 
-def get_raw_scores(examples, preds):
+def get_raw_scores(examples, preds, is_whitespace_splited=True):
     exact_scores = {}
     f1_scores = {}
     for example in examples:
@@ -399,9 +409,12 @@
             print('Missing prediction for %s' % qid)
             continue
         a_pred = preds[qid]
+
         # Take max over all gold answers
         exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
-        f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
+        f1_scores[qid] = max(
+            compute_f1(a, a_pred, is_whitespace_splited) for a in gold_answers)
+
     return exact_scores, f1_scores
@@ -472,14 +485,18 @@ find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs,
     main_eval['best_f1_thresh'] = f1_thresh
 
 
-def squad_evaluate(examples, preds, na_probs=None, na_prob_thresh=1.0):
+def squad_evaluate(examples,
+                   preds,
+                   na_probs=None,
+                   na_prob_thresh=1.0,
+                   is_whitespace_splited=True):
     if not na_probs:
         na_probs = {k: 0.0 for k in preds}
     qid_to_has_ans = make_qid_to_has_ans(examples)  # maps qid to True/False
     has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
     no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
 
-    exact_raw, f1_raw = get_raw_scores(examples, preds)
+    exact_raw, f1_raw = get_raw_scores(examples, preds, is_whitespace_splited)
     exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
                                           na_prob_thresh)
     f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
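
Usage sketch (illustrative, not part of the diff): with the dataset classes and the is_whitespace_splited flag introduced above, a Chinese MRC dev split can be loaded by class and scored at character level. The pretrained weight name 'ernie-1.0' below is an assumption for illustration; any tokenizer compatible with the chosen model works the same way.

    from paddlenlp.transformers import ErnieTokenizer
    from paddlenlp.datasets import CMRC
    from paddlenlp.metrics.squad import squad_evaluate

    # Build dev examples/features with the same arguments run_du.py forwards.
    tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')  # assumed weight name
    dev_ds = CMRC(
        tokenizer=tokenizer,
        mode='dev',
        doc_stride=128,
        max_query_length=64,
        max_seq_length=512)

    # After running a QA model over dev_ds and collecting `all_predictions`
    # via compute_predictions(..., is_whitespace_splited=False), report
    # character-level EM/F1 with:
    # squad_evaluate(
    #     examples=dev_ds.examples,
    #     preds=all_predictions,
    #     is_whitespace_splited=False)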