Unverified commit 4fea9819, authored by smallv0221, committed by GitHub

Add 2 datasets and update squad metric and run_du (#5060)

* update lrscheduler

* minor fix

* add pre-commit

* minor fix

* Add __len__ to squad dataset

* minor fix

* Add dureader robust prototype

* dataset implement

* minor fix

* fix var name

* add dureader-yesno train script and dataset

* add readme and fix md5sum

* integrate dureader datasets

* change var names: segment to mode, root to data_file

* minor fix

* update var name

* Fix api bugs

* add dataset readme

* add express ner

* update readme format

* fix format bug

* change readme path

* fix format bug

* fix dataset bug

* add 2 datasets and update squad metric
Parent 65c1f0b9
......@@ -3,6 +3,12 @@ import argparse
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--task_name",
default=None,
type=str,
required=True,
help="The name of the task.")
parser.add_argument(
"--data_path",
type=str,
......
......@@ -26,10 +26,13 @@ from args import parse_args
import json
import paddlenlp as ppnlp
from paddlenlp.datasets import SQuAD, DuReaderRobust, CMRC, DRCD
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
TASK_CLASSES = {"dureader-robust": DuReaderRobust, "cmrc": CMRC, "drcd": DRCD}
MODEL_CLASSES = {
"bert": (BertForQuestionAnswering, BertTokenizer),
"ernie": (ErnieForQuestionAnswering, ErnieTokenizer)
......@@ -89,18 +92,20 @@ def evaluate(model, data_loader, args, tokenizer, do_pred=False):
start_logits=start_logits,
end_logits=end_logits))
all_predictions, _, scores_diff_json = compute_predictions(
all_predictions, _, _ = compute_predictions(
data_loader.dataset.examples, data_loader.dataset.features, all_results,
args.n_best_size, args.max_answer_length, args.do_lower_case, False,
0.0, args.verbose, tokenizer)
0.0, args.verbose, tokenizer, False)
if do_pred:
with open('prediction.json', "w") as writer:
with open('prediction.json', "w", encoding='utf-8') as writer:
writer.write(
json.dumps(
all_predictions, ensure_ascii=False, indent=4) + "\n")
else:
squad_evaluate(data_loader.dataset.examples, all_predictions,
scores_diff_json, 1.0)
squad_evaluate(
examples=data_loader.dataset.examples,
preds=all_predictions,
is_whitespace_splited=False)
model.train()
......@@ -110,13 +115,16 @@ def do_train(args):
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
task_name = args.task_name.lower()
dataset_class = TASK_CLASSES[task_name]
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
root = args.data_path
set_seed(args)
train_ds = ppnlp.datasets.DuReaderRobust(
train_ds = dataset_class(
tokenizer=tokenizer,
root=root,
doc_stride=args.doc_stride,
......@@ -141,7 +149,7 @@ def do_train(args):
collate_fn=train_batchify_fn,
return_list=True)
dev_ds = ppnlp.datasets.DuReaderRobust(
dev_ds = dataset_class(
tokenizer=tokenizer,
root=root,
doc_stride=args.doc_stride,
......@@ -164,23 +172,6 @@ def do_train(args):
collate_fn=dev_batchify_fn,
return_list=True)
test_ds = ppnlp.datasets.DuReaderRobust(
tokenizer=tokenizer,
root=root,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
max_seq_length=args.max_seq_length,
mode='test')
test_batch_sampler = paddle.io.BatchSampler(
test_ds, batch_size=args.batch_size, shuffle=False)
test_data_loader = DataLoader(
dataset=test_ds,
batch_sampler=test_batch_sampler,
collate_fn=dev_batchify_fn,
return_list=True)
model = model_class.from_pretrained(args.model_name_or_path)
if paddle.distributed.get_world_size() > 1:
......@@ -245,9 +236,6 @@ def do_train(args):
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, dev_data_loader, args, tokenizer)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, test_data_loader, args, tokenizer, True)
if __name__ == "__main__":
args = parse_args()
......
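With this change run_du.py no longer hard-codes ppnlp.datasets.DuReaderRobust: the new --task_name argument is lowercased and looked up in TASK_CLASSES to pick the dataset class. A minimal sketch of that flow outside the script, under the assumption that "ernie-1.0" is a valid pretrained name for ErnieTokenizer (it is used here only for illustration):

from paddlenlp.datasets import DuReaderRobust, CMRC, DRCD
from paddlenlp.transformers import ErnieTokenizer

# Mapping taken from run_du.py in this diff.
TASK_CLASSES = {"dureader-robust": DuReaderRobust, "cmrc": CMRC, "drcd": DRCD}

task_name = "dureader-robust"            # value passed via --task_name
dataset_class = TASK_CLASSES[task_name]  # resolves to DuReaderRobust

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")  # assumed model name
train_ds = dataset_class(
    tokenizer=tokenizer,
    root=None,              # --data_path; None falls back to the default cache dir
    doc_stride=128,
    max_query_length=64,
    max_seq_length=512,
    mode="train")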
......@@ -10,7 +10,7 @@ from paddlenlp.utils.env import DATA_HOME
from paddle.io import Dataset
from .squad import InputFeatures, SQuAD
__all__ = ['DuReader', 'DuReaderYesNo', 'DuReaderRobust']
__all__ = ['DuReader', 'DuReaderYesNo']
class DuReaderExample(object):
......@@ -168,63 +168,6 @@ class DuReader(SQuAD):
self.examples = examples
class DuReaderRobust(SQuAD):
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz'
SPLITS = {
'train': META_INFO(
os.path.join('dureader_robust-data', 'train.json'),
'800a3dcb742f9fdf9b11e0a83433d4be'),
'dev': META_INFO(
os.path.join('dureader_robust-data', 'dev.json'),
'ae73cec081eaa28a735204c4898a2222'),
'test': META_INFO(
os.path.join('dureader_robust-data', 'test.json'),
'e0e8aa5c7b6d11b6fc3935e29fc7746f')
}
def __init__(self,
tokenizer,
mode='train',
version_2_with_negative=True,
root=None,
doc_stride=128,
max_query_length=64,
max_seq_length=512,
**kwargs):
super(DuReaderRobust, self).__init__(
tokenizer=tokenizer,
mode=mode,
version_2_with_negative=False,
root=root,
doc_stride=doc_stride,
max_query_length=max_query_length,
max_seq_length=max_seq_length,
**kwargs)
def _get_data(self, root, mode, **kwargs):
default_root = os.path.join(DATA_HOME, 'DuReader')
filename, data_hash = self.SPLITS[mode]
fullname = os.path.join(default_root,
filename) if root is None else os.path.join(
os.path.expanduser(root), filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
if root is not None: # not specified, and no need to warn
warnings.warn(
'md5 check failed for {}, download {} data to {}'.format(
filename, self.__class__.__name__, default_root))
get_path_from_url(self.DATA_URL, default_root)
self.full_path = fullname
class DuReaderYesNo(Dataset):
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
......
......@@ -8,8 +8,9 @@ from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddle.io import Dataset
from paddlenlp.utils.env import DATA_HOME
from paddlenlp.transformers.tokenizer_utils import _is_whitespace, _is_control, convert_to_unicode
__all__ = ['SQuAD']
__all__ = ['SQuAD', 'DuReaderRobust', 'CMRC', 'DRCD']
class SquadExample(object):
......@@ -96,7 +97,7 @@ class SQuAD(Dataset):
def __init__(self,
tokenizer,
mode='train',
version_2_with_negative=True,
version_2_with_negative=False,
root=None,
doc_stride=128,
max_query_length=64,
......@@ -127,7 +128,7 @@ class SQuAD(Dataset):
max_seq_length=self.max_seq_length)
def _get_data(self, root, mode, **kwargs):
default_root = os.path.join(DATA_HOME, 'SQuAD')
default_root = os.path.join(DATA_HOME, self.__class__.__name__)
if self.version_2_with_negative:
filename, data_hash = self.SPLITS['2.0'][mode]
else:
......@@ -166,7 +167,6 @@ class SQuAD(Dataset):
features = []
for (example_index, example) in enumerate(examples):
query_tokens = tokenizer._tokenize(example.question_text)
if len(query_tokens) > max_query_length:
query_tokens = query_tokens[0:max_query_length]
......@@ -285,7 +285,6 @@ class SQuAD(Dataset):
unique_id += 1
features.append(feature)
return features
def _improve_answer_span(self, doc_tokens, input_start, input_end,
......@@ -365,12 +364,6 @@ class SQuAD(Dataset):
with open(self.full_path, "r", encoding="utf8") as reader:
input_data = json.load(reader)["data"]
def is_whitespace(c):
if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(
c) == 0x202F:
return True
return False
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
......@@ -379,7 +372,7 @@ class SQuAD(Dataset):
char_to_word_offset = []
prev_is_whitespace = True
for c in paragraph_text:
if is_whitespace(c):
if _is_whitespace(c):
prev_is_whitespace = True
else:
if prev_is_whitespace:
......@@ -410,8 +403,11 @@ class SQuAD(Dataset):
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
end_position = char_to_word_offset[
answer_offset + answer_length - 1]
try:
end_position = char_to_word_offset[
answer_offset + answer_length - 1]
except:
continue
else:
start_position = -1
......@@ -428,7 +424,6 @@ class SQuAD(Dataset):
else:
start_position = -1
end_position = -1
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
......@@ -439,7 +434,7 @@ class SQuAD(Dataset):
is_impossible=is_impossible)
examples.append(example)
self.examples = examples
self.examples = examples[:1000]
def __len__(self):
return len(self.features)
......@@ -451,3 +446,212 @@ class SQuAD(Dataset):
return feature.input_ids, feature.segment_ids, feature.unique_id, feature.start_position, feature.end_position
else:
return feature.input_ids, feature.segment_ids, feature.unique_id
class DuReaderRobust(SQuAD):
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz'
SPLITS = {
'train': META_INFO(
os.path.join('dureader_robust-data', 'train.json'),
'800a3dcb742f9fdf9b11e0a83433d4be'),
'dev': META_INFO(
os.path.join('dureader_robust-data', 'dev.json'),
'ae73cec081eaa28a735204c4898a2222'),
'test': META_INFO(
os.path.join('dureader_robust-data', 'test.json'),
'e0e8aa5c7b6d11b6fc3935e29fc7746f')
}
def __init__(self,
tokenizer,
mode='train',
root=None,
doc_stride=128,
max_query_length=64,
max_seq_length=512,
**kwargs):
super(DuReaderRobust, self).__init__(
tokenizer=tokenizer,
mode=mode,
version_2_with_negative=False,
root=root,
doc_stride=doc_stride,
max_query_length=max_query_length,
max_seq_length=max_seq_length,
**kwargs)
def _get_data(self, root, mode, **kwargs):
default_root = os.path.join(DATA_HOME, self.__class__.__name__)
filename, data_hash = self.SPLITS[mode]
fullname = os.path.join(default_root,
filename) if root is None else os.path.join(
os.path.expanduser(root), filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
if root is not None: # not specified, and no need to warn
warnings.warn(
'md5 check failed for {}, download {} data to {}'.format(
filename, self.__class__.__name__, default_root))
get_path_from_url(self.DATA_URL, default_root)
self.full_path = fullname
def _read(self):
with open(self.full_path, "r", encoding="utf8") as reader:
input_data = json.load(reader)["data"]
examples = []
for entry in input_data:
for paragraph in entry["paragraphs"]:
paragraph_text = paragraph["context"]
raw_doc_tokens = self.tokenizer.basic_tokenizer.tokenize(
paragraph_text)
doc_tokens = []
char_to_word_offset = []
k = 0
temp_word = ""
for c in paragraph_text:
if not self.tokenizer.basic_tokenizer.tokenize(c):
char_to_word_offset.append(k - 1)
continue
else:
temp_word += c
char_to_word_offset.append(k)
if temp_word == raw_doc_tokens[k]:
doc_tokens.append(temp_word)
temp_word = ""
k += 1
assert k == len(raw_doc_tokens)
for qa in paragraph["qas"]:
qas_id = qa["id"]
question_text = qa["question"]
start_position = None
end_position = None
orig_answer_text = None
is_impossible = False
if self.is_training:
if (len(qa["answers"]) != 1):
raise ValueError(
"For training, each question should have exactly 1 answer."
)
answer = qa["answers"][0]
orig_answer_text = answer["text"]
answer_offset = answer["answer_start"]
answer_length = len(orig_answer_text)
start_position = char_to_word_offset[answer_offset]
try:
end_position = char_to_word_offset[
answer_offset + answer_length - 1]
except:
continue
else:
orig_answer_text = []
if 'answers' in qa.keys():
answers = qa["answers"]
for answer in answers:
orig_answer_text.append(answer["text"])
example = SquadExample(
qas_id=qas_id,
question_text=question_text,
doc_tokens=doc_tokens,
orig_answer_text=orig_answer_text,
start_position=start_position,
end_position=end_position,
is_impossible=is_impossible)
examples.append(example)
self.examples = examples
class CMRC(DuReaderRobust):
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
DEV_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_dev.json'
TRAIN_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_train.json'
TRIAL_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/cmrc/cmrc2018_trial.json'
SPLITS = {
'train': META_INFO(
os.path.join('cmrc2018_train.json'),
'7fb714b479c7f40fbb16acabd7af0ede'),
'dev': META_INFO(
os.path.join('cmrc2018_dev.json'),
'853b80709ff2d071f9fce196521b843c'),
'trial': META_INFO(
os.path.join('cmrc2018_trial.json'),
'853b80709ff2d071f9fce196521b843c')
}
def _get_data(self, root, mode, **kwargs):
default_root = os.path.join(DATA_HOME, self.__class__.__name__)
filename, data_hash = self.SPLITS[mode]
fullname = os.path.join(default_root,
filename) if root is None else os.path.join(
os.path.expanduser(root), filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
if root is not None: # not specified, and no need to warn
warnings.warn(
'md5 check failed for {}, download {} data to {}'.format(
filename, self.__class__.__name__, default_root))
if mode == 'train':
fullname = get_path_from_url(self.TRAIN_DATA_URL, default_root)
elif mode == 'dev':
fullname = get_path_from_url(self.DEV_DATA_URL, default_root)
elif mode == 'trial':
fullname = get_path_from_url(self.TRIAL_DATA_URL, default_root)
self.full_path = fullname
class DRCD(DuReaderRobust):
META_INFO = collections.namedtuple('META_INFO', ('file', 'md5'))
DEV_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_dev.json'
TRAIN_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_training.json'
TEST_DATA_URL = 'https://paddlenlp.bj.bcebos.com/datasets/DRCD/DRCD_test.json'
SPLITS = {
'train': META_INFO(
os.path.join('DRCD_training.json'),
'7fb714b479c7f40fbb16acabd7af0ede'),
'dev': META_INFO(
os.path.join('DRCD_dev.json'),
'853b80709ff2d071f9fce196521b843c'),
'test': META_INFO(
os.path.join('DRCD_test.json'), '853b80709ff2d071f9fce196521b843c')
}
def _get_data(self, root, mode, **kwargs):
default_root = os.path.join(DATA_HOME, self.__class__.__name__)
filename, data_hash = self.SPLITS[mode]
fullname = os.path.join(default_root,
filename) if root is None else os.path.join(
os.path.expanduser(root), filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
if root is not None: # not specified, and no need to warn
warnings.warn(
'md5 check failed for {}, download {} data to {}'.format(
filename, self.__class__.__name__, default_root))
if mode == 'train':
fullname = get_path_from_url(self.TRAIN_DATA_URL, default_root)
elif mode == 'dev':
fullname = get_path_from_url(self.DEV_DATA_URL, default_root)
elif mode == 'test':
fullname = get_path_from_url(self.TEST_DATA_URL, default_root)
self.full_path = fullname
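DuReaderRobust._read (reused by CMRC and DRCD) has to map SQuAD-style character offsets ("answer_start") onto tokens produced by the basic tokenizer, because Chinese text is not whitespace-delimited. The char_to_word_offset loop above does that mapping; below is a standalone toy version, with a trivial whitespace-dropping tokenizer standing in for tokenizer.basic_tokenizer (everything here is illustrative, not library code):

# Toy illustration of the char -> token offset mapping used in DuReaderRobust._read.
def toy_tokenize(text):
    # Stand-in for tokenizer.basic_tokenizer.tokenize(): one token per non-space char.
    return [c for c in text if not c.isspace()]

paragraph_text = "北京 欢迎你"
raw_doc_tokens = toy_tokenize(paragraph_text)

doc_tokens, char_to_word_offset = [], []
k, temp_word = 0, ""
for c in paragraph_text:
    if not toy_tokenize(c):              # whitespace: reuse the previous token index
        char_to_word_offset.append(k - 1)
        continue
    temp_word += c
    char_to_word_offset.append(k)
    if temp_word == raw_doc_tokens[k]:   # finished assembling one token
        doc_tokens.append(temp_word)
        temp_word = ""
        k += 1
assert k == len(raw_doc_tokens)

print(doc_tokens)           # ['北', '京', '欢', '迎', '你']
print(char_to_word_offset)  # [0, 1, 1, 2, 3, 4]
# An answer starting at character offset 3 ("欢") maps to token index 2.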
......@@ -14,10 +14,17 @@ import os
import math
def compute_predictions(all_examples, all_features, all_results, n_best_size,
max_answer_length, do_lower_case,
version_2_with_negative, null_score_diff_threshold,
verbose, tokenizer):
def compute_predictions(all_examples,
all_features,
all_results,
n_best_size,
max_answer_length,
do_lower_case,
version_2_with_negative,
null_score_diff_threshold,
verbose,
tokenizer,
is_whitespace_splited=True):
"""Write final predictions to the json file and log-odds of null if needed."""
example_index_to_features = collections.defaultdict(list)
......@@ -130,6 +137,8 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
orig_text = " ".join(orig_tokens)
final_text = get_final_text(tok_text, orig_text, tokenizer,
verbose)
if not is_whitespace_splited:
final_text = final_text.replace(' ', '')
if final_text in seen_predictions:
continue
......@@ -184,7 +193,6 @@ def compute_predictions(all_examples, all_features, all_results, n_best_size,
nbest_json.append(output)
assert len(nbest_json) >= 1
if not version_2_with_negative:
all_predictions[example.qas_id] = nbest_json[0]["text"]
else:
......@@ -246,7 +254,6 @@ def get_final_text(pred_text, orig_text, tokenizer, verbose):
# and `pred_text`, and check if they are the same length. If they are
# NOT the same length, the heuristic has failed. If they are the same
# length, we assume the characters are one-to-one aligned.
tok_text = " ".join(tokenizer.basic_tokenizer.tokenize(orig_text))
start_position = tok_text.find(pred_text)
if start_position == -1:
......@@ -356,21 +363,24 @@ def normalize_answer(s):
def lower(text):
return text.lower()
return white_space_fix(remove_articles(remove_punc(lower(s))))
def get_tokens(s):
if not s: return []
return normalize_answer(s).split()
if not s:
return []
else:
return white_space_fix(remove_articles(remove_punc(lower(s))))
def compute_exact(a_gold, a_pred):
return int(normalize_answer(a_gold) == normalize_answer(a_pred))
def compute_f1(a_gold, a_pred):
gold_toks = get_tokens(a_gold)
pred_toks = get_tokens(a_pred)
def compute_f1(a_gold, a_pred, is_whitespace_splited=True):
gold_toks = normalize_answer(a_gold).split()
pred_toks = normalize_answer(a_pred).split()
if not is_whitespace_splited:
gold_toks = gold_toks[0]
pred_toks = pred_toks[0]
common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
num_same = sum(common.values())
if len(gold_toks) == 0 or len(pred_toks) == 0:
......@@ -384,7 +394,7 @@ def compute_f1(a_gold, a_pred):
return f1
def get_raw_scores(examples, preds):
def get_raw_scores(examples, preds, is_whitespace_splited=True):
exact_scores = {}
f1_scores = {}
for example in examples:
......@@ -399,9 +409,12 @@ def get_raw_scores(examples, preds):
print('Missing prediction for %s' % qid)
continue
a_pred = preds[qid]
# Take max over all gold answers
exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
f1_scores[qid] = max(
compute_f1(a, a_pred, is_whitespace_splited) for a in gold_answers)
return exact_scores, f1_scores
......@@ -472,14 +485,18 @@ def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs,
main_eval['best_f1_thresh'] = f1_thresh
def squad_evaluate(examples, preds, na_probs=None, na_prob_thresh=1.0):
def squad_evaluate(examples,
preds,
na_probs=None,
na_prob_thresh=1.0,
is_whitespace_splited=True):
if not na_probs:
na_probs = {k: 0.0 for k in preds}
qid_to_has_ans = make_qid_to_has_ans(examples) # maps qid to True/False
has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
exact_raw, f1_raw = get_raw_scores(examples, preds)
exact_raw, f1_raw = get_raw_scores(examples, preds, is_whitespace_splited)
exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
na_prob_thresh)
f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
......