Unverified commit 26a0cd1e, authored by smallv0221, committed by GitHub

Add DuReader yesno and robust (#4992)

* update lrscheduler

* minor fix

* add pre-commit

* minor fix

* Add __len__ to squad dataset

* minor fix

* Add dureader robust prototype

* dataset implement

* minor fix

* fix var name

* add dureader-yesno train script and dataset

* add readme and fix md5sum

* integrete dureader datasets
Parent 2b2147b0
@@ -146,13 +146,13 @@ def set_seed(args):
paddle.seed(args.seed + paddle.distributed.get_rank())
def evaluate(model, loss_fct, metric, data_loader):
def evaluate(model, criterion, metric, data_loader):
model.eval()
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(logits, labels)
loss = criterion(logits, labels)
correct = metric.compute(logits, labels)
metric.update(correct)
accu = metric.accumulate()
@@ -310,20 +310,11 @@ def do_train(args):
if not any(nd in n for nd in ["bias", "norm"])
])
loss_fct = paddle.nn.loss.CrossEntropyLoss() if train_ds.get_labels(
criterion = paddle.nn.loss.CrossEntropyLoss() if train_ds.get_labels(
) else paddle.nn.loss.MSELoss()
metric = metric_class()
### TODO: use hapi
# trainer = paddle.hapi.Model(model)
# trainer.prepare(optimizer, loss_fct, paddle.metric.Accuracy())
# trainer.fit(train_data_loader,
# dev_data_loader,
# log_freq=args.logging_steps,
# epochs=args.num_train_epochs,
# save_dir=args.output_dir)
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
@@ -331,7 +322,7 @@ def do_train(args):
global_step += 1
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
loss = loss_fct(logits, labels)
loss = criterion(logits, labels)
if global_step % args.logging_steps == 0:
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
logger.info(
@@ -344,7 +335,7 @@ def do_train(args):
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.save_steps == 0:
evaluate(model, loss_fct, metric, dev_data_loader)
evaluate(model, criterion, metric, dev_data_loader)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
# Reading Comprehension: DuReader-robust
# Introduction
## 1. Task Description
Robustness is one of the key indicators of whether reading comprehension technology can be deployed at scale in real applications. Although current models achieve good results on a number of reading comprehension test sets, their robustness in practice is still far from satisfactory. DuReader-robust, the first Chinese dataset focused on the robustness of reading comprehension models, is designed to probe over-sensitivity, over-stability, and generalization in realistic application scenarios.
**The language models currently require PaddlePaddle 2.0 or later (or an appropriate develop build).**
## 2. Dataset
DuReader-robust is a single-passage, extractive reading comprehension dataset. The task is defined as follows:
given a question q and a passage p, the system must produce the answer a to the question based on the passage content. Each sample in the dataset is a triple <q, p, a>, for example:
**Question q**: 乔丹打了多少个赛季 (How many seasons did Jordan play?)
**Passage p**: 迈克尔.乔丹在NBA打了15个赛季。他在84年进入nba,期间在1993年10月6日第一次退役改打棒球,95年3月18日重新回归,在99年1月13日第二次退役,后于2001年10月31日复出,在03年最终退役…
**Reference answers a**: ['15个', '15个赛季']
For details about the dataset, see the dataset [paper](https://arxiv.org/abs/2004.11142).
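On disk the data ships in a SQuAD-style JSON layout; the training script loads it through the `DuReaderRobust` dataset class added in this PR. A minimal loading sketch (illustrative only, mirroring the arguments that `run_du.py` uses below):
```python
# Illustrative sketch: build and inspect the DuReader-robust training features.
import paddlenlp as ppnlp
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
train_ds = ppnlp.datasets.DuReaderRobust(
    tokenizer=tokenizer,
    doc_stride=128,
    max_query_length=64,
    max_seq_length=384,
    segment='train')
print(len(train_ds))  # number of features after sliding-window splitting
print(train_ds[0])    # (input_ids, segment_ids, unique_id, start_position, end_position)
```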
# Quick Start
## 1. Running the Model for the First Time
### Data Preparation
To make testing easy for developers, a data download script is built in. You can also pass the location of a local dataset via `--data_path`; that dataset must match the DuReader-robust format.
### Fine-tune
Start Fine-tuning as follows:
```shell
python -u ./run_du.py \
--model_type bert \
--model_name_or_path bert-base-chinese \
--max_seq_length 384 \
--batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--logging_steps 1000 \
--save_steps 1000 \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
--output_dir ./tmp/dureader-robust/ \
--n_gpu 1 \
```
* `model_type`: the type of pre-trained model, e.g. bert, ernie, roberta.
* `model_name_or_path`: the specific pre-trained model name, e.g. bert-base-uncased, bert-large-cased, or a local path to model files.
* `output_dir`: the directory where model checkpoints are saved.
After training finishes, the model is evaluated automatically and produces output similar to:
```text
{
"exact": 66.97247706422019,
"f1": 67.26064455422254,
"total": 1417,
"HasAns_exact": 66.97247706422019,
"HasAns_f1": 67.26064455422254,
"HasAns_total": 1417
}
```
After evaluation, the model automatically runs prediction on the test set and writes a submittable result to `prediction.json`.
**NOTE:** To resume training, point `model_name_or_path` at the checkpoint directory, e.g. `--model_name_or_path=./tmp/dureader-robust/model_19000/`. The script will automatically load the model parameters `model_state.pdparams` as well as the vocabulary, model config, and tokenizer config.
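A checkpoint directory can also be reloaded programmatically, for example (a sketch; the directory name below is hypothetical and depends on your `--save_steps` setting):
```python
# Illustrative sketch: reload a saved checkpoint outside the training script.
from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer

ckpt_dir = './tmp/dureader-robust/model_19000/'             # hypothetical checkpoint dir
model = BertForQuestionAnswering.from_pretrained(ckpt_dir)  # loads model_state.pdparams + model config
tokenizer = BertTokenizer.from_pretrained(ckpt_dir)         # loads vocab + tokenizer config
```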
## 2. Directory Structure
```text
.
├── README.md # documentation
├── run_du.py # training script
├── args.py # argument parsing
```
# Miscellaneous
## How to Contribute
If you can fix an issue or add a new feature, you are welcome to submit a PR. If the PR is accepted, we will score the contribution by quality and difficulty (0-5, higher is better). Once you accumulate 10 points, you can contact us for an interview opportunity or a recommendation letter.
import argparse
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--data_path",
type=str,
default=None,
help="Directory of all the data for train, valid, test.")
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Type of pre-trained model.")
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name of model.")
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written."
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.")
parser.add_argument(
"--batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
"--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument(
"--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument(
"--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.")
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs."
)
parser.add_argument(
"--warmup_proportion",
default=0.0,
type=float,
help="Proportion of training steps to perform linear learning rate warmup for."
)
parser.add_argument(
"--logging_steps",
type=int,
default=500,
help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.")
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
parser.add_argument(
"--doc_stride",
type=int,
default=128,
help="When splitting up a long document into chunks, how much stride to take between chunks."
)
parser.add_argument(
"--n_best_size",
type=int,
default=20,
help="The total number of n-best predictions to generate in the nbest_predictions.json output file."
)
parser.add_argument(
"--max_query_length", type=int, default=64, help="Max query length.")
parser.add_argument(
"--max_answer_length", type=int, default=30, help="Max answer length.")
parser.add_argument(
"--do_lower_case",
action='store_false',
help="Whether to lower case the input text. Should be True for uncased models and False for cased models."
)
parser.add_argument(
"--verbose", action='store_true', help="Whether to output verbose log.")
args = parser.parse_args()
return args
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
import time
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
from args import parse_args
import json
import paddlenlp as ppnlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForQuestionAnswering, BertTokenizer, ErnieForQuestionAnswering, ErnieTokenizer
from paddlenlp.metrics.squad import squad_evaluate, compute_predictions
MODEL_CLASSES = {
"bert": (BertForQuestionAnswering, BertTokenizer),
"ernie": (ErnieForQuestionAnswering, ErnieTokenizer)
}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
paddle.seed(args.seed)
class CrossEntropyLossForSQuAD(paddle.nn.Layer):
def __init__(self):
super(CrossEntropyLossForSQuAD, self).__init__()
def forward(self, y, label):
start_logits, end_logits = y
start_position, end_position = label
start_position = paddle.unsqueeze(start_position, axis=-1)
end_position = paddle.unsqueeze(end_position, axis=-1)
start_loss = paddle.nn.functional.softmax_with_cross_entropy(
logits=start_logits, label=start_position, soft_label=False)
start_loss = paddle.mean(start_loss)
end_loss = paddle.nn.functional.softmax_with_cross_entropy(
logits=end_logits, label=end_position, soft_label=False)
end_loss = paddle.mean(end_loss)
loss = (start_loss + end_loss) / 2
return loss
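# Note: the criterion is called with 2-tuples on both sides, e.g.
#   loss = CrossEntropyLossForSQuAD()((start_logits, end_logits),
#                                     (start_positions, end_positions))
# and averages the start- and end-position cross entropies (see the training loop below).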
def evaluate(model, data_loader, args, tokenizer, do_pred=False):
model.eval()
RawResult = collections.namedtuple(
"RawResult", ["unique_id", "start_logits", "end_logits"])
all_results = []
tic_eval = time.time()
for batch in data_loader:
        input_ids, segment_ids, unique_ids = batch
start_logits_tensor, end_logits_tensor = model(input_ids, segment_ids)
        for idx in range(unique_ids.shape[0]):
if len(all_results) % 1000 == 0 and len(all_results):
print("Processing example: %d" % len(all_results))
print('time per 1000:', time.time() - tic_eval)
tic_eval = time.time()
            unique_id = int(unique_ids[idx])
start_logits = [float(x) for x in start_logits_tensor.numpy()[idx]]
end_logits = [float(x) for x in end_logits_tensor.numpy()[idx]]
all_results.append(
RawResult(
unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
all_predictions, _, scores_diff_json = compute_predictions(
data_loader.dataset.examples, data_loader.dataset.features, all_results,
args.n_best_size, args.max_answer_length, args.do_lower_case, False,
0.0, args.verbose, tokenizer)
if do_pred:
with open('prediction.json', "w") as writer:
writer.write(
json.dumps(
all_predictions, ensure_ascii=False, indent=4) + "\n")
else:
squad_evaluate(data_loader.dataset.examples, all_predictions,
scores_diff_json, 1.0)
model.train()
def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
root = args.data_path
set_seed(args)
train_ds = ppnlp.datasets.DuReaderRobust(
tokenizer=tokenizer,
root=root,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
max_seq_length=args.max_seq_length,
segment='train')
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True)
train_batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
        Stack(), # unique_id
Stack(dtype="int64"), # start_pos
Stack(dtype="int64") # end_pos
): [data for i, data in enumerate(fn(samples)) if i != 2]
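    # How the collate function works (descriptive note): Tuple applies one fn per
    # sample field; Pad pads input_ids/segment_ids to the longest sequence in the
    # batch using the [PAD] token id; Stack batches the scalar fields. The
    # `if i != 2` drops the unique_id field, which is not needed for training.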
train_data_loader = DataLoader(
dataset=train_ds,
batch_sampler=train_batch_sampler,
collate_fn=train_batchify_fn,
return_list=True)
dev_ds = ppnlp.datasets.DuReaderRobust(
tokenizer=tokenizer,
root=root,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
max_seq_length=args.max_seq_length,
segment='dev')
dev_batch_sampler = paddle.io.BatchSampler(
dev_ds, batch_size=args.batch_size, shuffle=False)
dev_batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
        Stack() # unique_id
): fn(samples)
dev_data_loader = DataLoader(
dataset=dev_ds,
batch_sampler=dev_batch_sampler,
collate_fn=dev_batchify_fn,
return_list=True)
test_ds = ppnlp.datasets.DuReaderRobust(
tokenizer=tokenizer,
root=root,
doc_stride=args.doc_stride,
max_query_length=args.max_query_length,
max_seq_length=args.max_seq_length,
segment='test')
test_batch_sampler = paddle.io.BatchSampler(
test_ds, batch_size=args.batch_size, shuffle=False)
test_data_loader = DataLoader(
dataset=test_ds,
batch_sampler=test_batch_sampler,
collate_fn=dev_batchify_fn,
return_list=True)
model = model_class.from_pretrained(args.model_name_or_path)
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_ds.examples)//args.batch_size*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - warmup_proportion*num_training_steps))))
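    # The lambda above implements linear warmup followed by linear decay: the
    # learning-rate multiplier rises from 0 to 1 over the first
    # warmup_proportion * num_training_steps steps, then decays linearly back to 0
    # by the end of training. num_training_steps is args.max_steps if set,
    # otherwise len(train_ds.examples) // batch_size * num_train_epochs.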
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
criterion = CrossEntropyLossForSQuAD()
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, start_positions, end_positions = batch
logits = model(input_ids=input_ids, token_type_ids=segment_ids)
loss = criterion(logits, (start_positions, end_positions))
if global_step % args.logging_steps == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
% (global_step, epoch, step, loss,
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.save_steps == 0:
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print('Saving checkpoint to:', output_dir)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, dev_data_loader, args, tokenizer)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, test_data_loader, args, tokenizer, True)
if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
# Reading Comprehension: DuReader-yesno
# Introduction
## 1. Task Description
Metrics such as F1 and EM, commonly used in machine reading comprehension evaluation, measure how well an extractive model's predicted answer matches the reference answer. For opinion-type questions, however, they cannot tell whether the model truly understands what the answer means, for example the opinion polarity it expresses. DuReader-yesno is a dataset whose target task is opinion-polarity classification; it complements extractive datasets and allows a better evaluation of a model's natural language understanding ability.
**The language models currently require PaddlePaddle 2.0 or later (or an appropriate develop build).**
## 2. Dataset
The task of this dataset is defined as follows:
given a question q, a set of related documents D = d1, d2, …, dn, and a manually extracted answer summary a, the system must analyze q, D, and a, and output the yes/no opinion polarity expressed by each answer summary. The polarity falls into three classes {Yes, No, Depends}:
* Yes: an affirmative opinion, i.e. the answer expresses a fairly clear positive attitude. For factual questions the polarity is judged from the facts; for subjective questions it is judged from the overall attitude of the answer.
* No: a negative opinion, i.e. the answer fairly clearly expresses an attitude opposite to the question.
* Depends: undecidable / case-by-case, i.e. the matter itself has several cases with different conclusions, or the answer itself is uncertain and the polarity can only be judged from the specific situation.
For example:
```text
{
"documents":[
{
"title":"香蕉能放冰箱吗 香蕉剥皮冷冻保存_健康贴士_保健_99健康网",
"paragraphs":[
"本文导读:............."
]
}
],
"yesno_answer":"No",
"question":"香蕉能放冰箱吗",
"answer":"香蕉不能放冰箱,香蕉如果放冰箱里,会更容易变坏,会发黑腐烂。",
"id":293
}
```
# Quick Start
## 1. Running the Model for the First Time
### Data Preparation
To make testing easy for developers, a data download script is built in. You can also pass the location of a local dataset via `--data_path`; that dataset must match the DuReader-yesno format.
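The training script loads the three splits through the `DuReaderYesNo` dataset class added in this PR; a minimal loading sketch (illustrative only):
```python
# Illustrative sketch: inspect the raw DuReader-yesno samples.
import paddlenlp as ppnlp

train_ds, dev_ds, test_ds = ppnlp.datasets.DuReaderYesNo.get_datasets(
    ['train', 'dev', 'test'])
print(train_ds.get_labels())  # ['Yes', 'No', 'Depends']
print(dev_ds[0])              # [question, answer, yesno_answer, id]
```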
### Fine-tune
Start Fine-tuning as follows:
```shell
python -u ./run_du.py \
--model_type bert \
--model_name_or_path bert-base-chinese \
--max_seq_length 384 \
--batch_size 12 \
--learning_rate 3e-5 \
--num_train_epochs 2 \
--logging_steps 1000 \
--save_steps 1000 \
--warmup_proportion 0.1 \
--weight_decay 0.01 \
--output_dir ./tmp/dureader-yesno/ \
--n_gpu 1 \
```
* `model_type`: the type of pre-trained model, e.g. bert, ernie, roberta.
* `model_name_or_path`: the specific pre-trained model name, e.g. bert-base-uncased, bert-large-cased, or a local path to model files.
* `output_dir`: the directory where model checkpoints are saved.
After training finishes, the model is evaluated automatically and produces output similar to:
```text
accu: 0.861040
```
After evaluation, the model automatically runs prediction on the test set and writes a submittable result to `prediction.json`.
**NOTE:** To resume training, point `model_name_or_path` at the checkpoint directory, e.g. `--model_name_or_path=./tmp/dureader-yesno/model_19000/`. The script will automatically load the model parameters `model_state.pdparams` as well as the vocabulary, model config, and tokenizer config.
## 2. Directory Structure
```text
.
├── README.md # documentation
├── run_du.py # training script
├── args.py # argument parsing
```
# Miscellaneous
## How to Contribute
If you can fix an issue or add a new feature, you are welcome to submit a PR. If the PR is accepted, we will score the contribution by quality and difficulty (0-5, higher is better). Once you accumulate 10 points, you can contact us for an interview opportunity or a recommendation letter.
import argparse
def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--data_path",
type=str,
default=None,
help="Directory of all the data for train, valid, test.")
parser.add_argument(
"--model_type",
default=None,
type=str,
required=True,
help="Type of pre-trained model.")
parser.add_argument(
"--model_name_or_path",
default=None,
type=str,
required=True,
help="Path to pre-trained model or shortcut name of model.")
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written."
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="The maximum total input sequence length after tokenization. Sequences longer "
"than this will be truncated, sequences shorter will be padded.")
parser.add_argument(
"--batch_size",
default=8,
type=int,
help="Batch size per GPU/CPU for training.")
parser.add_argument(
"--learning_rate",
default=5e-5,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
"--weight_decay",
default=0.0,
type=float,
help="Weight decay if we apply some.")
parser.add_argument(
"--adam_epsilon",
default=1e-8,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument(
"--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")
parser.add_argument(
"--num_train_epochs",
default=3,
type=int,
help="Total number of training epochs to perform.")
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs."
)
parser.add_argument(
"--warmup_proportion",
default=0.0,
type=float,
help="Proportion of training steps to perform linear learning rate warmup for."
)
parser.add_argument(
"--logging_steps",
type=int,
default=500,
help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=500,
help="Save checkpoint every X updates steps.")
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
parser.add_argument(
"--do_lower_case",
action='store_false',
help="Whether to lower case the input text. Should be True for uncased models and False for cased models."
)
parser.add_argument(
"--verbose", action='store_true', help="Whether to output verbose log.")
args = parser.parse_args()
return args
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import os
import random
import time
from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
from args import parse_args
import json
import paddlenlp as ppnlp
from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)}
def set_seed(args):
random.seed(args.seed)
np.random.seed(args.seed)
paddle.seed(args.seed)
def convert_example(example,
tokenizer,
label_list,
max_seq_length=512,
is_test=False):
"""convert a DuReaderYesNo example into necessary features"""
def _truncate_seqs(seqs, max_seq_length):
# Account for [CLS], [SEP], [SEP] with "- 3"
tokens_a, tokens_b = seqs
max_seq_length -= 3
while True: # truncate with longest_first strategy
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_seq_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
return seqs
def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
segment_ids = sum(
([i] * (len(seq) + len(sep))
for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
if isinstance(seq_mask, int):
seq_mask = [[seq_mask] * len(seq) for seq in seqs]
if isinstance(separator_mask, int):
separator_mask = [[separator_mask] * len(sep) for sep in separators]
p_mask = sum((s_mask + mask
for sep, seq, s_mask, mask in zip(
separators, seqs, seq_mask, separator_mask)), [])
return concat, segment_ids, p_mask
if not is_test:
# `label_list == None` is for regression task
label_dtype = "int64" if label_list else "float32"
# get the label
label = example[-2]
example = example[:-2]
        # create a label map for classification tasks
if label_list:
label_map = {}
for (i, l) in enumerate(label_list):
label_map[l] = i
label = label_map[label]
label = np.array([label], dtype=label_dtype)
else:
qas_id = example[-1]
example = example[:-2]
# tokenize raw text
tokens_raw = [tokenizer(l) for l in example]
# truncate to the truncate_length,
tokens_trun = _truncate_seqs(tokens_raw, max_seq_length)
# concate the sequences with special tokens
tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
len(tokens_trun))
# convert the token to ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
valid_length = len(input_ids)
if not is_test:
return input_ids, segment_ids, valid_length, label
else:
return input_ids, segment_ids, valid_length, qas_id
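# For a (question, answer) pair, convert_example returns
# (input_ids, segment_ids, valid_length, label) during training and
# (input_ids, segment_ids, valid_length, qas_id) at test time; the two
# sequences are truncated with a longest-first strategy and joined as
# [CLS] question [SEP] answer [SEP].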
def evaluate(model, metric, data_loader, do_pred=False):
model.eval()
if not do_pred:
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
correct = metric.compute(logits, labels)
metric.update(correct)
accu = metric.accumulate()
print("accu: %f" % (accu))
else:
res = {}
for batch in data_loader:
input_ids, segment_ids, qas_id = batch
logits = model(input_ids, segment_ids)
qas_id = qas_id.numpy()
preds = paddle.argmax(logits, axis=1).numpy()
for i in range(len(preds)):
res[str(qas_id[i])] = data_loader.dataset.get_labels()[preds[i]]
with open('prediction.json', "w") as writer:
writer.write(json.dumps(res, ensure_ascii=False, indent=4) + "\n")
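        # prediction.json maps each question id to its predicted polarity label,
        # e.g. {"293": "No"} (illustrative entry).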
model.train()
def do_train(args):
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
set_seed(args)
train_ds, dev_ds, test_ds = ppnlp.datasets.DuReaderYesNo.get_datasets(
['train', 'dev', 'test'])
trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=train_ds.get_labels(),
max_seq_length=args.max_seq_length)
train_ds = train_ds.apply(trans_func, lazy=True)
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True)
batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
Stack(), # length
        Stack(dtype="int64"), # label
): [data for i, data in enumerate(fn(samples)) if i != 2]
train_data_loader = DataLoader(
dataset=train_ds,
batch_sampler=train_batch_sampler,
collate_fn=batchify_fn,
return_list=True)
dev_ds = dev_ds.apply(trans_func, lazy=True)
dev_batch_sampler = paddle.io.BatchSampler(
dev_ds, batch_size=args.batch_size, shuffle=False)
dev_data_loader = DataLoader(
dataset=dev_ds,
batch_sampler=dev_batch_sampler,
collate_fn=batchify_fn,
return_list=True)
test_trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=train_ds.get_labels(),
max_seq_length=args.max_seq_length,
is_test=True)
test_ds = test_ds.apply(test_trans_func, lazy=True)
test_batch_sampler = paddle.io.BatchSampler(
test_ds, batch_size=args.batch_size, shuffle=False)
test_batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
Stack() # length
): fn(samples)
test_data_loader = DataLoader(
dataset=test_ds,
batch_sampler=test_batch_sampler,
collate_fn=batchify_fn,
return_list=True)
model = model_class.from_pretrained(args.model_name_or_path, num_classes=3)
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, warmup_proportion=args.warmup_proportion,
num_training_steps=args.max_steps if args.max_steps > 0 else
(len(train_ds.examples)//args.batch_size*args.num_train_epochs): float(
current_step) / float(max(1, warmup_proportion*num_training_steps))
if current_step < warmup_proportion*num_training_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - warmup_proportion*num_training_steps))))
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
criterion = paddle.nn.loss.CrossEntropyLoss()
metric = paddle.metric.Accuracy()
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, label = batch
logits = model(input_ids=input_ids, token_type_ids=segment_ids)
loss = criterion(logits, label)
if global_step % args.logging_steps == 0:
print(
"global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s"
% (global_step, epoch, step, loss,
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.save_steps == 0:
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print('Saving checkpoint to:', output_dir)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, metric, dev_data_loader)
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, metric, test_data_loader, True)
if __name__ == "__main__":
args = parse_args()
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
@@ -7,10 +7,10 @@ import warnings
from paddle.dataset.common import md5file
from paddle.utils.download import get_path_from_url
from paddlenlp.utils.env import DATA_HOME
from paddle.io import Dataset
from .squad import InputFeatures, SQuAD
__all__ = ['DuReader']
__all__ = ['DuReader', 'DuReaderYesNo', 'DuReaderRobust']
class DuReaderExample(object):
@@ -166,3 +166,136 @@ class DuReader(SQuAD):
examples.append(example)
self.examples = examples[:2000]
class DuReaderRobust(SQuAD):
SEGMENT_INFO = collections.namedtuple('SEGMENT_INFO', ('file', 'md5'))
DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_robust-data.tar.gz'
SEGMENTS = {
'train': SEGMENT_INFO(
os.path.join('dureader_robust-data', 'train.json'),
'800a3dcb742f9fdf9b11e0a83433d4be'),
'dev': SEGMENT_INFO(
os.path.join('dureader_robust-data', 'dev.json'),
'ae73cec081eaa28a735204c4898a2222'),
'test': SEGMENT_INFO(
os.path.join('dureader_robust-data', 'test.json'),
'e0e8aa5c7b6d11b6fc3935e29fc7746f')
}
def __init__(self,
tokenizer,
segment='train',
version_2_with_negative=True,
root=None,
doc_stride=128,
max_query_length=64,
max_seq_length=512,
**kwargs):
super(DuReaderRobust, self).__init__(
tokenizer=tokenizer,
segment=segment,
version_2_with_negative=False,
root=root,
doc_stride=doc_stride,
max_query_length=max_query_length,
max_seq_length=max_seq_length,
**kwargs)
def _get_data(self, root, segment, **kwargs):
default_root = os.path.join(DATA_HOME, 'DuReader')
filename, data_hash = self.SEGMENTS[segment]
fullname = os.path.join(default_root,
filename) if root is None else os.path.join(
os.path.expanduser(root), filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
            if root is not None:  # root was given but data is missing or corrupted; warn before downloading to the default location
warnings.warn(
'md5 check failed for {}, download {} data to {}'.format(
filename, self.__class__.__name__, default_root))
get_path_from_url(self.DATA_URL, default_root)
self.full_path = fullname
class DuReaderYesNo(Dataset):
SEGMENT_INFO = collections.namedtuple('SEGMENT_INFO', ('file', 'md5'))
DATA_URL = 'https://dataset-bj.cdn.bcebos.com/qianyan/dureader_yesno-data.tar.gz'
SEGMENTS = {
'train': SEGMENT_INFO(
os.path.join('dureader_yesno-data', 'train.json'),
'c469a0ef3f975cfd705e3553ddb27cc1'),
'dev': SEGMENT_INFO(
os.path.join('dureader_yesno-data', 'dev.json'),
'c38544f8b5a7b567492314e3232057b5'),
'test': SEGMENT_INFO(
os.path.join('dureader_yesno-data', 'test.json'),
'1c7a1a3ea5b8992eeaeea017fdc2d55f')
}
def __init__(self, segment='train', root=None, **kwargs):
self._get_data(root, segment, **kwargs)
self._transform_func = None
if segment == 'train':
self.is_training = True
else:
self.is_training = False
self._read()
def _get_data(self, root, segment, **kwargs):
default_root = os.path.join(DATA_HOME, 'DuReader')
filename, data_hash = self.SEGMENTS[segment]
fullname = os.path.join(default_root,
filename) if root is None else os.path.join(
os.path.expanduser(root), filename)
if not os.path.exists(fullname) or (data_hash and
not md5file(fullname) == data_hash):
            if root is not None:  # root was given but data is missing or corrupted; warn before downloading to the default location
warnings.warn(
'md5 check failed for {}, download {} data to {}'.format(
filename, self.__class__.__name__, default_root))
get_path_from_url(self.DATA_URL, default_root)
self.full_path = fullname
def _read(self):
data_lines = []
with open(self.full_path, "r", encoding="utf8") as reader:
data_lines += reader.readlines()
examples = []
for entry in data_lines:
source = json.loads(entry.strip())
examples.append([
source['question'], source['answer'], source['yesno_answer'],
source['id']
])
self.examples = examples
def __len__(self):
return len(self.examples)
def __getitem__(self, idx):
return self.examples[idx]
def get_labels(self):
"""
Return labels of the DuReaderYesNo sample.
"""
return ["Yes", "No", "Depends"]
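# Example usage (illustrative): DuReaderYesNo(segment='dev')[0] returns
# [question, answer, yesno_answer, id]; get_labels() lists the polarity classes.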
@@ -69,31 +69,27 @@ class SQuAD(Dataset):
SEGMENT_INFO = collections.namedtuple('SEGMENT_INFO', ('file', 'md5'))
DEV_DATA_URL_V2 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/dev-v2.0.json'
DEV_DATA_MD5_V2 = '7ab59a1b04bd7cb773f98a0717106c9b'
TRAIN_DATA_URL_V2 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/train-v2.0.json'
TRAIN_DATA_MD5_V2 = '793daf7b6224281e75fe61c1f80afe35'
DEV_DATA_URL_V1 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/dev-v1.1.json'
DEV_DATA_MD5_V1 = '7ab59a1b04bd7cb773f98a0717106c9b'
TRAIN_DATA_URL_V1 = 'https://paddlenlp.bj.bcebos.com/datasets/squad/train-v1.1.json'
TRAIN_DATA_MD5_V1 = '793daf7b6224281e75fe61c1f80afe35'
SEGMENTS = {
'1.1': {
'train': SEGMENT_INFO(
os.path.join('v1', 'train.json'),
'dc2dac669a113866a6480a0b10cd50bf'),
os.path.join('v1', 'train-v1.1.json'),
'981b29407e0affa3b1b156f72073b945'),
'dev': SEGMENT_INFO(
os.path.join('v1', 'dev.json'),
'185958e46ba556b38c6a7cc63f3a2135')
os.path.join('v1', 'dev-v1.1.json'),
'3e85deb501d4e538b6bc56f786231552')
},
'2.0': {
'train': SEGMENT_INFO(
os.path.join('v2', 'train.json'),
'dc2dac669a113866a6480a0b10cd50bf'),
os.path.join('v2', 'train-v2.0.json'),
'62108c273c268d70893182d5cf8df740'),
'dev': SEGMENT_INFO(
os.path.join('v2', 'dev.json'),
'185958e46ba556b38c6a7cc63f3a2135')
os.path.join('v2', 'dev-v2.0.json'),
'246adae8b7002f8679c027697b0b7cf8')
}
}
@@ -123,7 +119,7 @@ class SQuAD(Dataset):
self._read()
self.data = self.convert_examples_to_feature(
self.features = self.convert_examples_to_feature(
self.examples,
tokenizer=self.tokenizer,
doc_stride=self.doc_stride,
@@ -425,7 +421,7 @@ class SQuAD(Dataset):
if self.version_2_with_negative:
is_impossible = qa["is_impossible"]
orig_answer_text = []
if not is_impossible:
if not is_impossible and 'answers' in qa.keys():
answers = qa["answers"]
for answer in answers:
orig_answer_text.append(answer["text"])
@@ -446,10 +442,10 @@ class SQuAD(Dataset):
self.examples = examples
def __len__(self):
return len(self.data)
return len(self.features)
def __getitem__(self, idx):
feature = self.data[idx]
feature = self.features[idx]
if self.is_training:
return feature.input_ids, feature.segment_ids, feature.unique_id, feature.start_position, feature.end_position