Unverified commit f07cdf53 authored by: J jeff41404, committed by: GitHub

add electra pretrain and modify style of electra modeling (#4990)

* add electra pretrain and modify style of electra modeling

* add electra pretrain, modify style of electra modeling and fix problems of review

* delete predict_classifer

* modify accu to acc

* add paddlenlp.metrics.glue
Parent 8e45228d
......@@ -25,25 +25,27 @@ from functools import partial
import numpy as np
import paddle
from paddle.io import DataLoader
from paddle.metric import Metric, Accuracy, Precision, Recall
from paddlenlp.datasets import GlueCoLA, GlueSST2, GlueMRPC, GlueSTSB, GlueQQP, GlueMNLI, GlueQNLI, GlueRTE
from paddlenlp.data import Stack, Tuple, Pad
from paddlenlp.data.sampler import SamplerHelper
from paddlenlp.transformers import ElectraForSequenceClassification, ElectraTokenizer
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
TASK_CLASSES = {
"cola": (GlueCoLA, paddle.metric.Accuracy),
"sst-2": (GlueSST2, paddle.metric.Accuracy),
"mrpc": (GlueMRPC, paddle.metric.Accuracy),
"sts-b": (GlueSTSB, paddle.metric.Accuracy),
"qqp": (GlueQQP, paddle.metric.Accuracy),
"mnli": (GlueMNLI, paddle.metric.Accuracy),
"qnli": (GlueQNLI, paddle.metric.Accuracy),
"rte": (GlueRTE, paddle.metric.Accuracy),
"cola": (GlueCoLA, Mcc),
"sst-2": (GlueSST2, Accuracy),
"mrpc": (GlueMRPC, AccuracyAndF1),
"sts-b": (GlueSTSB, PearsonAndSpearman),
"qqp": (GlueQQP, AccuracyAndF1),
"mnli": (GlueMNLI, Accuracy),
"qnli": (GlueQNLI, Accuracy),
"rte": (GlueRTE, Accuracy),
}
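# Note: the updated mapping follows the official GLUE evaluation metrics -- Matthews
# correlation for CoLA, accuracy plus F1 for MRPC and QQP, Pearson/Spearman correlation
# for STS-B, and plain accuracy for SST-2, MNLI, QNLI and RTE.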
MODEL_CLASSES = {
......@@ -57,21 +59,17 @@ def set_seed(args):
paddle.seed(args.seed + paddle.distributed.get_rank())
def evaluate(model, loss_fct, metric, data_loader, return_dict):
def evaluate(model, loss_fct, metric, data_loader):
model.eval()
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
model_output = model(input_ids=input_ids, token_type_ids=segment_ids)
if not return_dict:
logits = model_output[0]
else:
logits = model_output.logits
logits = model(input_ids=input_ids, token_type_ids=segment_ids)
loss = loss_fct(logits, labels)
correct = metric.compute(logits, labels)
metric.update(correct)
accu = metric.accumulate()
print("eval loss: %f, accu: %f, " % (loss.numpy(), accu), end='')
acc = metric.accumulate()
print("eval loss: %f, acc: %s, " % (loss.numpy(), acc), end='')
model.train()
......@@ -218,9 +216,10 @@ def do_train(args):
num_workers=0,
return_list=True)
num_labels = 1 if train_dataset.get_labels() is None else len(
train_dataset.get_labels())
model = model_class.from_pretrained(
args.model_name_or_path, num_labels=len(train_dataset.get_labels()))
return_dict = model.return_dict
args.model_name_or_path, num_labels=num_labels)
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
......@@ -267,14 +266,14 @@ def do_train(args):
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, segment_ids, labels = batch
model_output = model(
input_ids=input_ids, token_type_ids=segment_ids)
if not return_dict:
logits = model_output[0]
else:
logits = model_output.logits
logits = model(input_ids=input_ids, token_type_ids=segment_ids)
loss = loss_fct(logits, labels)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step % args.logging_steps == 0:
print(
"global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
......@@ -282,21 +281,15 @@ def do_train(args):
paddle.distributed.get_rank(), loss, optimizer.get_lr(),
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
if global_step > 1 and global_step % args.save_steps == 0:
if global_step % args.save_steps == 0:
tic_eval = time.time()
if args.task_name == "mnli":
evaluate(model, loss_fct, metric, dev_data_loader_matched,
return_dict)
evaluate(model, loss_fct, metric, dev_data_loader_matched)
evaluate(model, loss_fct, metric,
dev_data_loader_mismatched, return_dict)
dev_data_loader_mismatched)
print("eval done total : %s s" % (time.time() - tic_eval))
else:
evaluate(model, loss_fct, metric, dev_data_loader,
return_dict)
evaluate(model, loss_fct, metric, dev_data_loader)
print("eval done total : %s s" % (time.time() - tic_eval))
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
......@@ -309,7 +302,6 @@ def do_train(args):
model, paddle.DataParallel) else model
model_to_save.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
global_step += 1
def get_md5sum(file_path):
......@@ -374,7 +366,7 @@ if __name__ == "__main__":
"than this will be truncated, sequences shorter will be padded.", )
parser.add_argument(
"--learning_rate",
default=3e-4,
default=1e-4,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import collections
import itertools
import logging
import os
import io
import random
import time
from functools import partial
from concurrent.futures import ThreadPoolExecutor
import numpy as np
import paddle
import paddle.distributed as dist
from paddle.io import DataLoader, Dataset
from paddlenlp.transformers import ElectraForTotalPretraining, ElectraModel, ElectraPretrainingCriterion
from paddlenlp.transformers import ElectraDiscriminator, ElectraGenerator
from paddlenlp.transformers import ElectraTokenizer
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
MODEL_CLASSES = {"electra": (ElectraForTotalPretraining, ElectraTokenizer), }
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument(
"--model_type",
default="electra",
type=str,
help="Model type selected in the list: " +
", ".join(MODEL_CLASSES.keys()), )
parser.add_argument(
"--model_name_or_path",
default="electra-small",
type=str,
help="Path to pre-trained model or shortcut name selected in the list: "
+ ", ".join(
sum([
list(classes[-1].pretrained_init_configuration.keys())
for classes in MODEL_CLASSES.values()
], [])), )
parser.add_argument(
"--input_dir",
default=None,
type=str,
required=True,
help="The input directory where the data will be read from.", )
parser.add_argument(
"--output_dir",
default=None,
type=str,
required=True,
help="The output directory where the model predictions and checkpoints will be written.",
)
parser.add_argument(
"--max_seq_length",
default=128,
type=int,
help="max length of each sequence")
parser.add_argument(
"--mask_prob",
default=0.15,
type=float,
help="the probability of one word to be mask")
parser.add_argument(
"--train_batch_size",
default=96,
type=int,
help="Batch size per GPU/CPU for training.", )
parser.add_argument(
"--eval_batch_size",
default=96,
type=int,
help="Batch size per GPU/CPU for training.", )
parser.add_argument(
"--learning_rate",
default=5e-4,
type=float,
help="The initial learning rate for Adam.")
parser.add_argument(
"--weight_decay",
default=0.01,
type=float,
help="Weight decay if we apply some.")
parser.add_argument(
"--adam_epsilon",
default=1e-6,
type=float,
help="Epsilon for Adam optimizer.")
parser.add_argument(
"--num_train_epochs",
default=4,
type=int,
help="Total number of training epochs to perform.", )
parser.add_argument(
"--max_steps",
default=-1,
type=int,
help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
)
parser.add_argument(
"--warmup_steps",
default=10000,
type=int,
help="Linear warmup over warmup_steps.")
parser.add_argument(
"--logging_steps",
type=int,
default=100,
help="Log every X updates steps.")
parser.add_argument(
"--save_steps",
type=int,
default=1000,
help="Save checkpoint every X updates steps.")
parser.add_argument(
"--init_from_ckpt",
type=bool,
default=False,
help="Whether to load model checkpoint. if True, args.model_name_or_path must be dir store ckpt"
)
parser.add_argument(
"--seed", type=int, default=42, help="random seed for initialization")
parser.add_argument(
"--eager_run", type=bool, default=True, help="Use dygraph mode.")
parser.add_argument(
"--n_gpu",
type=int,
default=1,
help="number of gpus to use, 0 for cpu.")
args = parser.parse_args()
return args
def set_seed(args):
random.seed(args.seed + paddle.distributed.get_rank())
np.random.seed(args.seed + paddle.distributed.get_rank())
paddle.seed(args.seed + paddle.distributed.get_rank())
class WorkerInitObj(object):
def __init__(self, seed):
self.seed = seed
def __call__(self, id):
np.random.seed(seed=self.seed + id)
random.seed(self.seed + id)
class BookCorpus(paddle.io.Dataset):
"""
https://web.eecs.umich.edu/~lahiri/gutenberg_dataset.html
Args:
data_path (:obj:`str`) : The dataset directory, which contains train.data, dev.data and test.data.
tokenizer (:obj:`class PretrainedTokenizer`) : The tokenizer used to split words and convert them to ids.
max_seq_length (:obj:`int`) : The maximum length of each sequence.
mode (:obj:`str`, `optional`, defaults to `train`):
It identifies the dataset split (train, test or dev).
"""
def __init__(
self,
data_path,
tokenizer,
max_seq_length,
mode='train', ):
if mode == 'train':
data_file = 'train.data'
elif mode == 'test':
data_file = 'test.data'
else:
data_file = 'dev.data'
self.data_file = os.path.join(data_path, data_file)
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
self.raw_examples = self._read_file(self.data_file)
def _read_file(self, input_file):
"""
Reads a text file.
Args:
input_file (:obj:`str`) : The file to be read.
Returns:
examples (:obj:`list`): All the input data.
"""
if not os.path.exists(input_file):
raise RuntimeError("The file {} is not found.".format(input_file))
else:
with io.open(input_file, "r", encoding="UTF-8") as f:
examples = []
for line in f.read().splitlines():
if (len(line) > 0 and not line.isspace()):
tokens = self.tokenizer(line)
ids = self.tokenizer.convert_tokens_to_ids(tokens)
example = self.truncation_ids(ids, self.max_seq_length)
examples.append(example)
return examples
def truncation_ids(self, ids, max_seq_length):
if len(ids) <= (max_seq_length - 2):
return ids
else:
return ids[:(max_seq_length - 2)]
def __getitem__(self, idx):
return self.raw_examples[idx]
def __len__(self):
return len(self.raw_examples)
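# Illustrative usage sketch for BookCorpus (the data path and the "electra-small" shortcut
# are placeholder assumptions; each item is a list of token ids truncated to
# max_seq_length - 2 so that [CLS]/[SEP] can be added later by the collator).
def _bookcorpus_usage_example():
    tokenizer = ElectraTokenizer.from_pretrained("electra-small")
    dataset = BookCorpus(
        data_path="./bookcorpus_data",  # directory containing train.data/dev.data/test.data
        tokenizer=tokenizer,
        max_seq_length=128,
        mode='train')
    print(len(dataset), dataset[0][:10])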
class DataCollatorForElectra(object):
"""
Pads inputs, converts a batch to tensors and preprocesses it for masked language modeling.
When the dataloader's num_workers > 0, this collator may trigger bugs; to be safe, keep num_workers=0.
"""
def __init__(self,
tokenizer,
max_seq_length,
mlm=True,
mlm_probability=0.15):
self.tokenizer = tokenizer
self.max_seq_length = max_seq_length
self.mlm = mlm
self.mlm_probability = mlm_probability
def __call__(self, examples):
if self.mlm:
inputs, raw_inputs, labels = self.mask_tokens(examples)
return inputs, raw_inputs, labels
else:
raw_inputs, _ = self.add_special_tokens_and_set_maskprob(
examples, True, self.max_seq_length)
raw_inputs = self.tensorize_batch(raw_inputs, "int64")
inputs = raw_inputs.clone().detach()
labels = raw_inputs.clone().detach()
if self.tokenizer.pad_token is not None:
pad_token_id = self.tokenizer.convert_tokens_to_ids(
self.tokenizer.pad_token)
labels[labels == pad_token_id] = -100
return inputs, raw_inputs, labels
def tensorize_batch(self, examples, dtype):
if isinstance(examples[0], (list, tuple)):
examples = [paddle.to_tensor(e, dtype=dtype) for e in examples]
length_of_first = examples[0].shape[0]
are_tensors_same_length = all(x.shape[0] == length_of_first
for x in examples)
if are_tensors_same_length:
return paddle.stack(examples, axis=0)
else:
raise ValueError(
"the tensor in examples not have same shape, please check input examples"
)
def add_special_tokens_and_set_maskprob(self, inputs, truncation,
max_seq_length):
sep_token_id = self.tokenizer.convert_tokens_to_ids(
self.tokenizer.sep_token)
pad_token_id = self.tokenizer.convert_tokens_to_ids(
self.tokenizer.pad_token)
cls_token_id = self.tokenizer.convert_tokens_to_ids(
self.tokenizer.cls_token)
full_inputs = []
full_maskprob = []
max_length = 0
for ids in inputs:
if len(ids) > max_length:
max_length = len(ids)
max_length = min(max_length + 2, max_seq_length)
for ids in inputs:
if len(ids) <= (max_length - 2):
padding_num = max_length - len(ids) - 2
full_inputs.append([cls_token_id] + ids + [sep_token_id] + (
[pad_token_id] * padding_num))
full_maskprob.append([0] + ([self.mlm_probability] * len(ids)) +
[0] + ([0] * padding_num))
else:
if truncation:
full_inputs.append([cls_token_id] + ids[:(max_length - 2)] +
[sep_token_id])
full_maskprob.append([0] + ([self.mlm_probability] * (
max_length - 2)) + [0])
else:
full_inputs.append([cls_token_id] + ids + [sep_token_id])
full_maskprob.append([0] + ([self.mlm_probability] * len(
ids)) + [0])
return full_inputs, full_maskprob
def mask_tokens(self, examples):
if self.tokenizer.mask_token is None:
raise ValueError(
"the tokenizer does not have mask_token, please check!")
mask_token_id = self.tokenizer.convert_tokens_to_ids(
self.tokenizer.mask_token)
raw_inputs, probability_matrix = self.add_special_tokens_and_set_maskprob(
examples, True, self.max_seq_length)
raw_inputs = self.tensorize_batch(raw_inputs, "int64")
probability_matrix = self.tensorize_batch(probability_matrix, "float32")
inputs = raw_inputs.clone()
labels = raw_inputs.clone()
total_indices = paddle.bernoulli(probability_matrix).astype("bool")
unuse_labels = paddle.full(labels.shape, -100).astype("int64")
labels = paddle.where(total_indices, labels, unuse_labels)
# 80% MASK
indices_mask = paddle.bernoulli(paddle.full(labels.shape, 0.8)).astype(
"bool").logical_and(total_indices)
masked_inputs = paddle.full(inputs.shape, mask_token_id).astype("int64")
inputs = paddle.where(indices_mask, masked_inputs, inputs)
# 10% Random
indices_random = paddle.bernoulli(paddle.full(
labels.shape, 0.5)).astype("bool").logical_and(
total_indices).logical_and(indices_mask.logical_not())
random_words = paddle.randint(
low=0,
high=self.tokenizer.vocab_size,
shape=labels.shape,
dtype="int64")
inputs = paddle.where(indices_random, random_words, inputs)
# 10% Original
return inputs, raw_inputs, labels
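# Minimal numpy sketch of the 80/10/10 corruption scheme implemented by mask_tokens above
# (illustration only; mask_id, vocab_size and the fixed seed are made-up values, and the
# per-batch padding/special-token handling of the real collator is omitted).
def _masking_sketch(ids, mask_prob=0.15, mask_id=103, vocab_size=30522, seed=0):
    rng = np.random.RandomState(seed)
    ids = np.asarray(ids)
    labels = np.full_like(ids, -100)
    selected = rng.rand(*ids.shape) < mask_prob                      # positions chosen for MLM
    labels[selected] = ids[selected]                                 # only masked positions get labels
    corrupted = ids.copy()
    use_mask = selected & (rng.rand(*ids.shape) < 0.8)               # ~80%: replace with [MASK]
    corrupted[use_mask] = mask_id
    use_rand = selected & ~use_mask & (rng.rand(*ids.shape) < 0.5)   # ~10%: random token
    corrupted[use_rand] = rng.randint(0, vocab_size, size=ids.shape)[use_rand]
    return corrupted, ids, labels                                    # remaining ~10% stay unchanged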
def create_dataloader(dataset,
mode='train',
batch_size=1,
use_gpu=True,
data_collator=None):
"""
Creates a dataloader.
Args:
dataset(obj:`paddle.io.Dataset`):
Dataset instance.
mode(obj:`str`, optional, defaults to obj:`train`):
If mode is 'train', it will shuffle the dataset randomly.
batch_size(obj:`int`, optional, defaults to 1):
The sample number of a mini-batch.
use_gpu(obj:`bool`, optional, defaults to obj:`True`):
Whether to use gpu to run.
Returns:
dataloader(obj:`paddle.io.DataLoader`): The dataloader which generates batches.
"""
#print("%s.data has batch_size : %s" % (mode, batch_size))
if mode == 'train' and use_gpu:
sampler = paddle.io.DistributedBatchSampler(
dataset=dataset, batch_size=batch_size, shuffle=True)
dataloader = paddle.io.DataLoader(
dataset,
batch_sampler=sampler,
return_list=True,
collate_fn=data_collator,
num_workers=0)
else:
shuffle = True if mode == 'train' else False
sampler = paddle.io.BatchSampler(
dataset=dataset, batch_size=batch_size, shuffle=shuffle)
dataloader = paddle.io.DataLoader(
dataset,
batch_sampler=sampler,
return_list=True,
collate_fn=data_collator,
num_workers=0)
return dataloader
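# Illustrative end-to-end sketch wiring the pieces above together, roughly as do_train does
# below (the data path, lengths and batch size are placeholder assumptions).
def _pretrain_dataloader_example():
    tokenizer = ElectraTokenizer.from_pretrained("electra-small")
    dataset = BookCorpus("./bookcorpus_data", tokenizer, max_seq_length=128, mode='train')
    collator = DataCollatorForElectra(
        tokenizer=tokenizer, max_seq_length=128, mlm=True, mlm_probability=0.15)
    loader = create_dataloader(
        dataset, mode='train', batch_size=8, use_gpu=False, data_collator=collator)
    for input_ids, raw_input_ids, gen_labels in loader:
        print(input_ids.shape)  # [batch_size, max_len_in_batch]
        break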
def do_train(args):
paddle.enable_static() if not args.eager_run else None
paddle.set_device("gpu" if args.n_gpu else "cpu")
if paddle.distributed.get_world_size() > 1:
paddle.distributed.init_parallel_env()
set_seed(args)
worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
# Loads or initializes a model.
pretrained_models = list(tokenizer_class.pretrained_init_configuration.keys(
))
if args.model_name_or_path in pretrained_models:
tokenizer = tokenizer_class.from_pretrained("./")
#tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
generator = ElectraGenerator(
ElectraModel(**ElectraModel.pretrained_init_configuration[
args.model_name_or_path + "-generator"]))
discriminator = ElectraDiscriminator(
ElectraModel(**ElectraModel.pretrained_init_configuration[
args.model_name_or_path + "-discriminator"]))
model = model_class(generator, discriminator)
else:
if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
# load checkpoint
tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
for file_id, file_name in model_class.resource_files_names.items():
full_file_name = os.path.join(args.model_name_or_path,
file_name)
# TODO: load the model checkpoint file
else:
raise ValueError("initialize a model need identifier or the "
"path to a directory instead. The supported model "
"identifiers are as follows: {}".format(
model_class.pretrained_init_configuration.keys(
)))
criterion = ElectraPretrainingCriterion(
getattr(model.generator,
ElectraGenerator.base_model_prefix).config["vocab_size"],
model.gen_weight, model.disc_weight)
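# Note: ElectraPretrainingCriterion is expected to combine the two pretraining losses roughly as
#     total_loss = gen_weight * MLM_loss(gen_logits, gen_labels)
#                + disc_weight * BCE_loss(disc_logits, disc_labels)
# where gen_weight=1.0 and disc_weight=50.0 come from ElectraPretrainedModel, matching the
# lambda=50 discriminator weighting used in the ELECTRA paper.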
if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)
# Loads dataset.
tic_load_data = time.time()
print("start load data : %s" %
(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
train_dataset = BookCorpus(
data_path=args.input_dir,
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
mode='train')
print("load data done, total : %s s" % (time.time() - tic_load_data))
# Reads data and generates mini-batches.
data_collator = DataCollatorForElectra(
tokenizer=tokenizer,
max_seq_length=args.max_seq_length,
mlm=True,
mlm_probability=args.mask_prob)
train_data_loader = create_dataloader(
train_dataset,
batch_size=args.train_batch_size,
mode='train',
use_gpu=True if args.n_gpu else False,
data_collator=data_collator)
num_training_steps = args.max_steps if args.max_steps > 0 else (
len(train_data_loader) * args.num_train_epochs)
lr_scheduler = paddle.optimizer.lr.LambdaDecay(
args.learning_rate,
lambda current_step, num_warmup_steps=args.warmup_steps,
num_training_steps=num_training_steps: float(
current_step) / float(max(1, num_warmup_steps))
if current_step < num_warmup_steps else max(
0.0,
float(num_training_steps - current_step) / float(
max(1, num_training_steps - num_warmup_steps))))
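# Worked example of the schedule above (illustrative numbers): with warmup_steps=10000 and
# num_training_steps=1000000 the lambda returns
#     step    5000 ->  5000 / 10000               = 0.5   (linear warmup)
#     step   10000 -> (1000000 - 10000) / 990000  = 1.0   (peak)
#     step  505000 -> (1000000 - 505000) / 990000 = 0.5   (linear decay)
#     step 1000000 -> 0.0
# and the actual learning rate is args.learning_rate multiplied by this factor.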
optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
parameters=model.parameters(),
weight_decay=args.weight_decay,
apply_decay_param_fun=lambda x: x in [
p.name for n, p in model.named_parameters()
if not any(nd in n for nd in ["bias", "norm"])
])
print("start train : %s" %
(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
global_step = 0
tic_train = time.time()
for epoch in range(args.num_train_epochs):
for step, batch in enumerate(train_data_loader):
global_step += 1
input_ids, raw_input_ids, gen_labels = batch
gen_logits, disc_logits, disc_labels = model(
input_ids=input_ids,
raw_input_ids=raw_input_ids,
gen_labels=gen_labels)
loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels)
loss.backward()
optimizer.step()
lr_scheduler.step()
optimizer.clear_gradients()
#print("backward done, total %s s" % (time.time() - tic_train))
#tic_train = time.time()
if global_step % args.logging_steps == 0:
print(
"global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
% (global_step, num_training_steps, epoch, step,
paddle.distributed.get_rank(), loss, optimizer.get_lr(),
args.logging_steps / (time.time() - tic_train)))
tic_train = time.time()
if global_step % args.save_steps == 0:
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
output_dir = os.path.join(args.output_dir,
"model_%d.pdparams" % global_step)
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# need better way to get inner model of DataParallel
model_to_save = model._layers if isinstance(
model, paddle.DataParallel) else model
#model_to_save.save_pretrained(output_dir)
paddle.save(model.state_dict(),
os.path.join(output_dir,
"model_state.pdparams"))
tokenizer.save_pretrained(output_dir)
paddle.save(optimizer.state_dict(),
os.path.join(output_dir, "model_state.pdopt"))
def print_arguments(args):
"""print arguments"""
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).items()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
if __name__ == "__main__":
args = parse_args()
print_arguments(args)
if args.n_gpu > 1:
paddle.distributed.spawn(do_train, args=(args, ), nprocs=args.n_gpu)
else:
do_train(args)
......@@ -14,4 +14,5 @@
from .perplexity import Perplexity
from .chunk import ChunkEvaluator
from .bleu import BLEU
\ No newline at end of file
from .bleu import BLEU
from .glue import AccuracyAndF1, Mcc, PearsonAndSpearman
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import math
from functools import partial
import numpy as np
import paddle
from paddle.metric import Metric, Accuracy, Precision, Recall
__all__ = ['AccuracyAndF1', 'Mcc', 'PearsonAndSpearman']
class AccuracyAndF1(Metric):
"""
Encapsulates Accuracy, Precision, Recall and F1 metric logic.
"""
def __init__(self,
topk=(1, ),
pos_label=1,
name='acc_and_f1',
*args,
**kwargs):
super(AccuracyAndF1, self).__init__(*args, **kwargs)
self.topk = topk
self.pos_label = pos_label
self._name = name
self.acc = Accuracy(self.topk, *args, **kwargs)
self.precision = Precision(*args, **kwargs)
self.recall = Recall(*args, **kwargs)
self.reset()
def compute(self, pred, label, *args):
self.label = label
self.preds_pos = paddle.nn.functional.softmax(pred)[:, self.pos_label]
return self.acc.compute(pred, label)
def update(self, correct, *args):
self.acc.update(correct)
self.precision.update(self.preds_pos, self.label)
self.recall.update(self.preds_pos, self.label)
def accumulate(self):
acc = self.acc.accumulate()
precision = self.precision.accumulate()
recall = self.recall.accumulate()
if precision == 0.0 or recall == 0.0:
f1 = 0.0
else:
# 1/f1 = 1/2 * (1/precision + 1/recall)
f1 = (2 * precision * recall) / (precision + recall)
return (
acc,
precision,
recall,
f1,
(acc + f1) / 2, )
def reset(self):
self.acc.reset()
self.precision.reset()
self.recall.reset()
self.label = None
self.preds_pos = None
def name(self):
"""
Return name of metric instance.
"""
return self._name
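# Illustrative usage of AccuracyAndF1 with made-up logits/labels, mirroring the
# compute -> update -> accumulate flow used in run_glue.py:
def _accuracy_and_f1_example():
    metric = AccuracyAndF1()
    logits = paddle.to_tensor([[0.1, 0.9], [0.8, 0.2], [0.3, 0.7]])
    labels = paddle.to_tensor([1, 0, 0])
    correct = metric.compute(logits, labels)
    metric.update(correct)
    acc, precision, recall, f1, avg = metric.accumulate()
    print(acc, precision, recall, f1, avg)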
class Mcc(Metric):
"""
Matthews correlation coefficient
https://en.wikipedia.org/wiki/Matthews_correlation_coefficient.
"""
def __init__(self, name='mcc', *args, **kwargs):
super(Mcc, self).__init__(*args, **kwargs)
self._name = name
self.tp = 0 # true positive
self.fp = 0 # false positive
self.tn = 0 # true negative
self.fn = 0 # false negative
def compute(self, pred, label, *args):
preds = paddle.argsort(pred, descending=True)[:, :1]
return (preds, label)
def update(self, preds_and_labels):
preds = preds_and_labels[0]
preds = preds.numpy()
labels = preds_and_labels[1]
labels = labels.numpy().reshape(-1, 1)
sample_num = labels.shape[0]
for i in range(sample_num):
pred = preds[i]
label = labels[i]
if pred == 1:
if pred == label:
self.tp += 1
else:
self.fp += 1
else:
if pred == label:
self.tn += 1
else:
self.fn += 1
def accumulate(self):
if self.tp == 0 or self.fp == 0 or self.tn == 0 or self.fn == 0:
mcc = 0.0
else:
# mcc = (tp*tn - fp*fn) / sqrt((tp+fp)*(tp+fn)*(tn+fp)*(tn+fn))
mcc = (self.tp * self.tn - self.fp * self.fn) / math.sqrt(
(self.tp + self.fp) * (self.tp + self.fn) *
(self.tn + self.fp) * (self.tn + self.fn))
return (mcc, )
def reset(self):
self.tp = 0 # true positive
self.fp = 0 # false positive
self.tn = 0 # true negative
self.fn = 0 # false negative
def name(self):
"""
Return name of metric instance.
"""
return self._name
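# Worked example for the formula above (illustrative counts): with tp=6, fp=1, tn=8, fn=2
#     mcc = (6*8 - 1*2) / sqrt((6+1)*(6+2)*(8+1)*(8+2)) = 46 / sqrt(5040) ~= 0.648
def _mcc_by_hand():
    tp, fp, tn, fn = 6, 1, 8, 2
    return (tp * tn - fp * fn) / math.sqrt(
        (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))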
class PearsonAndSpearman(Metric):
"""
Pearson correlation coefficient
https://en.wikipedia.org/wiki/Pearson_correlation_coefficient
Spearman's rank correlation coefficient
https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient.
"""
def __init__(self, name='pearson_and_spearman', *args, **kwargs):
super(PearsonAndSpearman, self).__init__(*args, **kwargs)
self._name = name
self.preds = []
self.labels = []
def update(self, preds_and_labels):
preds = preds_and_labels[0]
preds = np.squeeze(preds.numpy().reshape(-1, 1)).tolist()
labels = preds_and_labels[1]
labels = np.squeeze(labels.numpy().reshape(-1, 1)).tolist()
self.preds.append(preds)
self.labels.append(labels)
def accumulate(self):
preds = [item for sublist in self.preds for item in sublist]
labels = [item for sublist in self.labels for item in sublist]
#import pdb; pdb.set_trace()
pearson = self.pearson(preds, labels)
spearman = self.spearman(preds, labels)
return (
pearson,
spearman,
(pearson + spearman) / 2, )
def pearson(self, preds, labels):
n = len(preds)
#simple sums
sum1 = sum(float(preds[i]) for i in range(n))
sum2 = sum(float(labels[i]) for i in range(n))
#sum up the squares
sum1_pow = sum([pow(v, 2.0) for v in preds])
sum2_pow = sum([pow(v, 2.0) for v in labels])
#sum up the products
p_sum = sum([preds[i] * labels[i] for i in range(n)])
numerator = p_sum - (sum1 * sum2 / n)
denominator = math.sqrt(
(sum1_pow - pow(sum1, 2) / n) * (sum2_pow - pow(sum2, 2) / n))
if denominator == 0:
return 0.0
return numerator / denominator
def spearman(self, preds, labels):
preds_rank = self.get_rank(preds)
labels_rank = self.get_rank(labels)
total = 0
n = len(preds)
for i in range(n):
total += pow((preds_rank[i] - labels_rank[i]), 2)
spearman = 1 - float(6 * total) / (n * (pow(n, 2) - 1))
return spearman
def get_rank(self, raw_list):
x = np.array(raw_list)
r_x = np.empty(x.shape, dtype=int)
y = np.argsort(-x)
for i, k in enumerate(y):
r_x[k] = i + 1
return r_x
def reset(self):
self.preds = []
self.labels = []
def name(self):
"""
Return name of metric instance.
"""
return self._name
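# Illustrative usage of PearsonAndSpearman with made-up regression scores (as produced for
# STS-B); both correlations are close to 1.0 for this monotone example:
def _pearson_and_spearman_example():
    metric = PearsonAndSpearman()
    preds = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
    labels = paddle.to_tensor([1.1, 1.9, 3.2, 3.8])
    metric.update((preds, labels))
    pearson, spearman, avg = metric.accumulate()
    print(pearson, spearman, avg)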
......@@ -13,7 +13,6 @@
# limitations under the License.
import os
import time
from dataclasses import dataclass
from typing import Optional, Tuple
from collections import OrderedDict
......@@ -25,14 +24,10 @@ import paddle.nn.functional as F
from .. import PretrainedModel, register_base_model
__all__ = [
'ElectraModel',
'ElectraForTotalPretraining',
'ElectraForPretraining',
'ElectraForMaskedLM',
'ElectraClassificationHead',
'ElectraForSequenceClassification',
'ElectraForTokenClassification',
'ElectraModelOutput',
'ElectraModel', 'ElectraForTotalPretraining', 'ElectraDiscriminator',
'ElectraGenerator', 'ElectraClassificationHead',
'ElectraForSequenceClassification', 'ElectraForTokenClassification',
'ElectraPretrainingCriterion'
]
......@@ -70,13 +65,8 @@ ACT2FN = {
class ElectraEmbeddings(nn.Layer):
"""Construct the embeddings from word, position and token_type embeddings."""
def __init__(self,
vocab_size,
embedding_size,
hidden_dropout_prob,
max_position_embeddings,
type_vocab_size,
layer_norm_eps=1e-12):
def __init__(self, vocab_size, embedding_size, hidden_dropout_prob,
max_position_embeddings, type_vocab_size):
super(ElectraEmbeddings, self).__init__()
self.word_embeddings = nn.Embedding(vocab_size, embedding_size)
self.position_embeddings = nn.Embedding(max_position_embeddings,
......@@ -84,33 +74,24 @@ class ElectraEmbeddings(nn.Layer):
self.token_type_embeddings = nn.Embedding(type_vocab_size,
embedding_size)
self.layer_norm = nn.LayerNorm(embedding_size, epsilon=layer_norm_eps)
self.layer_norm = nn.LayerNorm(embedding_size, epsilon=1e-12)
self.dropout = nn.Dropout(hidden_dropout_prob)
def forward(self,
input_ids=None,
token_type_ids=None,
position_ids=None,
inputs_embeds=None):
if input_ids is not None:
input_shape = input_ids.shape
else:
input_shape = inputs_embeds.shape[:-1]
seq_length = input_shape[1]
def forward(self, input_ids, token_type_ids=None, position_ids=None):
if position_ids is None:
position_ids = paddle.arange(0, seq_length, dtype="int64")
ones = paddle.ones_like(input_ids, dtype="int64")
seq_length = paddle.cumsum(ones, axis=1)
position_ids = seq_length - ones
position_ids.stop_gradient = True
if token_type_ids is None:
token_type_ids = paddle.zeros_like(input_ids, dtype="int64")
if inputs_embeds is None:
inputs_embeds = self.word_embeddings(input_ids)
input_embeddings = self.word_embeddings(input_ids)
position_embeddings = self.position_embeddings(position_ids)
token_type_embeddings = self.token_type_embeddings(token_type_ids)
embeddings = inputs_embeds + position_embeddings + token_type_embeddings
embeddings = input_embeddings + position_embeddings + token_type_embeddings
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings
......@@ -140,14 +121,14 @@ class ElectraGeneratorPredictions(nn.Layer):
def __init__(self, embedding_size, hidden_size, hidden_act):
super(ElectraGeneratorPredictions, self).__init__()
self.LayerNorm = nn.LayerNorm(embedding_size)
self.layer_norm = nn.LayerNorm(embedding_size)
self.dense = nn.Linear(hidden_size, embedding_size)
self.act = get_activation(hidden_act)
def forward(self, generator_hidden_states):
hidden_states = self.dense(generator_hidden_states)
hidden_states = self.act(hidden_states)
hidden_states = self.LayerNorm(hidden_states)
hidden_states = self.layer_norm(hidden_states)
return hidden_states
......@@ -165,16 +146,11 @@ class ElectraPretrainedModel(PretrainedModel):
disc_weight = 50.0
tie_word_embeddings = True
untied_generator_embeddings = False
untied_generator = True
output_hidden_states = False
output_attentions = False
return_dict = False
use_softmax_sample = True
# model init configuration
pretrained_init_configuration = {
"electra-small-generator": {
"architectures": ["ElectraForMaskedLM"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 128,
"hidden_act": "gelu",
......@@ -182,9 +158,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 256,
"initializer_range": 0.02,
"intermediate_size": 1024,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 4,
"num_hidden_layers": 12,
"pad_token_id": 0,
......@@ -192,7 +166,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 30522
},
"electra-base-generator": {
"architectures": ["ElectraForMaskedLM"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 768,
"hidden_act": "gelu",
......@@ -200,9 +173,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 256,
"initializer_range": 0.02,
"intermediate_size": 1024,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 4,
"num_hidden_layers": 12,
"pad_token_id": 0,
......@@ -210,7 +181,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 30522
},
"electra-large-generator": {
"architectures": ["ElectraForMaskedLM"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 1024,
"hidden_act": "gelu",
......@@ -218,9 +188,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 256,
"initializer_range": 0.02,
"intermediate_size": 1024,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 4,
"num_hidden_layers": 24,
"pad_token_id": 0,
......@@ -228,7 +196,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 30522
},
"electra-small-discriminator": {
"architectures": ["ElectraForPretraining"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 128,
"hidden_act": "gelu",
......@@ -236,9 +203,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 256,
"initializer_range": 0.02,
"intermediate_size": 1024,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 4,
"num_hidden_layers": 12,
"pad_token_id": 0,
......@@ -246,7 +211,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 30522
},
"electra-base-discriminator": {
"architectures": ["ElectraForPretraining"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 768,
"hidden_act": "gelu",
......@@ -254,9 +218,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
......@@ -264,7 +226,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 30522
},
"electra-large-discriminator": {
"architectures": ["ElectraForPretraining"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 1024,
"hidden_act": "gelu",
......@@ -272,9 +233,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 1024,
"initializer_range": 0.02,
"intermediate_size": 4096,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 16,
"num_hidden_layers": 24,
"pad_token_id": 0,
......@@ -282,7 +241,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 30522
},
"chinese-electra-discriminator-small": {
"architectures": ["ElectraForPretraining"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 128,
"hidden_act": "gelu",
......@@ -290,9 +248,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 256,
"initializer_range": 0.02,
"intermediate_size": 1024,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 4,
"num_hidden_layers": 12,
"pad_token_id": 0,
......@@ -300,7 +256,6 @@ class ElectraPretrainedModel(PretrainedModel):
"vocab_size": 21128,
},
"chinese-electra-discriminator-base": {
"architectures": ["ElectraForPretraining"],
"attention_probs_dropout_prob": 0.1,
"embedding_size": 768,
"hidden_act": "gelu",
......@@ -308,9 +263,7 @@ class ElectraPretrainedModel(PretrainedModel):
"hidden_size": 768,
"initializer_range": 0.02,
"intermediate_size": 3072,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "electra",
"num_attention_heads": 12,
"num_hidden_layers": 12,
"pad_token_id": 0,
......@@ -399,177 +352,20 @@ class ElectraPretrainedModel(PretrainedModel):
output_embeddings.weight.shape[
-1], output_embeddings.bias.shape[0]))
def get_extended_attention_mask(self, attention_mask, input_shape, place):
"""
Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
Arguments:
attention_mask (:obj:`paddle.Tensor`):
Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
input_shape (:obj:`Tuple[int]`):
The shape of the input to the model.
place: (:obj:`paddle.Tensor.place`):
The place of the input to the model.
Returns:
:obj:`paddle.Tensor` The extended attention mask, with a the same dtype as :obj:`attention_mask.dtype`.
"""
# We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
# ourselves in which case we just need to make it broadcastable to all heads.
if attention_mask.dim() == 3:
#extended_attention_mask = attention_mask[:, None, :, :]
extended_attention_mask = attention_mask.unsqueeze(1)
elif attention_mask.dim() == 2:
# Provided a padding mask of dimensions [batch_size, seq_length]
extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
else:
raise ValueError(
"Wrong shape for input_ids (shape {}) or attention_mask (shape {})".
format(input_shape, attention_mask.shape))
# Since attention_mask is 1.0 for positions we want to attend and 0.0 for
# masked positions, this operation will create a tensor which is 0.0 for
# positions we want to attend and -10000.0 for masked positions.
# Since we are adding it to the raw scores before the softmax, this is
# effectively the same as removing these entirely.
#extended_attention_mask = extended_attention_mask.to(dtype=self.dtype) # fp16 compatibility
extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
return extended_attention_mask
def get_head_mask(self,
head_mask,
num_hidden_layers,
is_attention_chunked=False):
"""
Prepare the head mask if needed.
Args:
head_mask (:obj:`paddle.Tensor` with shape :obj:`[num_heads]`
or :obj:`[num_hidden_layers x num_heads]`, `optional`):
The mask indicating if we should keep the heads or not (1.0 for keep, 0.0 for discard).
num_hidden_layers (:obj:`int`):
The number of hidden layers in the model.
is_attention_chunked: (:obj:`bool`, `optional, defaults to :obj:`False`):
Whether or not the attentions scores are computed by chunks or not.
Returns:
:obj:`paddle.Tensor` with shape :obj:`[num_hidden_layers x batch x num_heads x seq_length x seq_length]
or list with :obj:`[None]` for each layer.
"""
if head_mask is not None:
head_mask = self._convert_head_mask_to_5d(head_mask,
num_hidden_layers)
if is_attention_chunked is True:
head_mask = head_mask.unsqueeze(-1)
else:
head_mask = [None] * num_hidden_layers
return head_mask
def _convert_head_mask_to_5d(self, head_mask, num_hidden_layers):
"""-> [num_hidden_layers x batch x num_heads x seq_length x seq_length]"""
if head_mask.dim() == 1:
head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(
-1).unsqueeze(-1)
head_mask = paddle.expand(head_mask,
[num_hidden_layers, -1, -1, -1, -1])
elif head_mask.dim() == 2:
# We can specify head_mask for each layer
head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
assert (head_mask.dim() == 5
), "head_mask.dim != 5, instead {head_mask.dim()}"
#head_mask = head_mask.to(dtype=self.dtype) # switch to float if need + fp16 compatibility
return head_mask
@dataclass
class ElectraModelOutput(OrderedDict):
"""
Output type of :class:`ElectraPretrainedModel`.
Args:
loss (`optional`, returned when ``labels`` is provided, ``paddle.Tensor`` of shape :obj:`(1,)`):
Total loss of the ELECTRA objective.
logits (:obj:`paddle.Tensor` dtype=float32 of shape :obj:`(batch_size, sequence_length)`):
Prediction scores of the head (scores for each token before SoftMax).
hidden_states (:obj:`tuple(paddle.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``output_hidden_states=True``):
Tuple of :obj:`paddle.Tensor` dtype=float32
(one for the output of the embeddings + one for the output of each layer)
of shape :obj:`(batch_size, sequence_length, hidden_size)`.
Hidden-states of the model at the output of each layer plus the initial embedding outputs.
attentions (:obj:`tuple(paddle.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``output_attentions=True``):
Tuple of :obj:`paddle.Tensor` dtype=float32 (one for each layer) of shape :obj:`(batch_size, num_heads,
sequence_length, sequence_length)`.
Attentions weights after the attention softmax,
used to compute the weighted average in the self-attention heads.
"""
loss = None
logits = None
hidden_states = None
attentions = None
ELECTRA_START_DOCSTRING = r"""
This model inherits from :class:`ElectraPretrainedModel`. Check the superclass documentation for the generic
methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
pruning heads etc.)
This model is also a Paddle `paddle.nn.Layer <https://www.paddlepaddle.org.cn/documentation/docs/zh/develop/api/paddle/fluid/dygraph/layers/Layer_cn.html#layer>`__
subclass. Use it as a regular Paddle Module and refer to the Padddle documentation for all matter related to
general usage and behavior.
Parameters:
"""
ELECTRA_INPUTS_DOCSTRING = r"""
Args:
input_ids (:obj:`paddle.Tensor` of shape :obj:`({0})`):
Indices of input sequence tokens in the vocabulary.
attention_mask (:obj:`paddle.Tensor` dtype=float32 of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
token_type_ids (:obj:`paddle.Tensor` dtype=int64 of shape :obj:`({0})`, `optional`):
Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
1]``:
- 0 corresponds to a `sentence A` token,
- 1 corresponds to a `sentence B` token.
position_ids (:obj:`paddle.Tensor` dtype=int64 of shape :obj:`({0})`, `optional`):
Indices of positions of each input sequence tokens in the position embeddings.
Selected in the range ``[0, max_position_embeddings - 1]``.
head_mask (:obj:`paddle.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
inputs_embeds (:obj:`paddle.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
vectors than the model's internal embedding lookup matrix.
encoder_hidden_states (:obj:`paddle.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
the model is configured as a decoder.
encoder_attention_mask (:obj:`paddle.Tensor` of shape :obj:`({0})`, `optional`):
Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
the cross-attention if the model is configured as a decoder. Mask values selected in ``[0, 1]``:
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
output_attentions (:obj:`bool`, `optional`):
Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
tensors for more detail.
output_hidden_states (:obj:`bool`, `optional`):
Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
more detail.
return_dict (:obj:`bool`, `optional`):
Whether or not to return a :class:`ElectraModelOutput` instead of a plain tuple.
"""
@register_base_model
class ElectraModel(ElectraPretrainedModel):
def __init__(self, vocab_size, embedding_size, hidden_size,
num_hidden_layers, num_attention_heads, intermediate_size,
hidden_act, hidden_dropout_prob, attention_probs_dropout_prob,
max_position_embeddings, type_vocab_size, layer_norm_eps,
pad_token_id, initializer_range, model_type, architectures):
max_position_embeddings, type_vocab_size, initializer_range,
pad_token_id):
super(ElectraModel, self).__init__()
self.pad_token_id = pad_token_id
self.initializer_range = initializer_range
self.embeddings = ElectraEmbeddings(
vocab_size, embedding_size, hidden_dropout_prob,
max_position_embeddings, type_vocab_size, layer_norm_eps)
max_position_embeddings, type_vocab_size)
if embedding_size != hidden_size:
self.embeddings_project = nn.Linear(embedding_size, hidden_size)
......@@ -592,70 +388,33 @@ class ElectraModel(ElectraPretrainedModel):
def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None, ):
output_attentions = output_attentions if output_attentions is not None else self.output_attentions
output_hidden_states = (output_hidden_states
if output_hidden_states is not None else
self.output_hidden_states)
return_dict = return_dict if return_dict is not None else self.return_dict
if input_ids is not None and inputs_embeds is not None:
raise ValueError(
"You cannot specify both input_ids and inputs_embeds at the same time"
)
elif input_ids is not None:
input_shape = input_ids.shape
elif inputs_embeds is not None:
input_shape = inputs_embeds.shape[:-1]
else:
raise ValueError(
"You have to specify either input_ids or inputs_embeds")
place = input_ids.place if input_ids is not None else inputs_embeds.place
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
if attention_mask is None:
attention_mask = paddle.ones(input_shape)
if token_type_ids is None:
token_type_ids = paddle.zeros(input_shape, dtype="int64")
extended_attention_mask = self.get_extended_attention_mask(
attention_mask, input_shape, place)
#head_mask = self.get_head_mask(head_mask, self.num_hidden_layers)
attention_mask = paddle.unsqueeze(
(input_ids == self.pad_token_id).astype("float32") * -1e9,
axis=[1, 2])
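# Descriptive note: this builds an additive attention mask of shape [batch_size, 1, 1, seq_len]
# that is 0.0 at real tokens and -1e9 at padding positions, so adding it to the attention
# scores before the softmax effectively removes padded tokens from attention.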
hidden_states = self.embeddings(
embedding_output = self.embeddings(
input_ids=input_ids,
position_ids=position_ids,
token_type_ids=token_type_ids,
inputs_embeds=inputs_embeds)
token_type_ids=token_type_ids)
if hasattr(self, "embeddings_project"):
hidden_states = self.embeddings_project(hidden_states)
embedding_output = self.embeddings_project(embedding_output)
hidden_states = self.encoder(
hidden_states,
extended_attention_mask
#head_mask=head_mask,
#output_attentions=output_attentions,
#output_hidden_states=output_hidden_states,
#return_dict=return_dict,
)
encoder_outputs = self.encoder(embedding_output, attention_mask)
return (hidden_states, )
return encoder_outputs
class ElectraForPretraining(ElectraPretrainedModel):
class ElectraDiscriminator(ElectraPretrainedModel):
def __init__(self, electra):
super(ElectraForPretraining, self).__init__()
super(ElectraDiscriminator, self).__init__()
self.electra = electra
self.discriminator_predictions = ElectraDiscriminatorPredictions(
......@@ -663,73 +422,23 @@ class ElectraForPretraining(ElectraPretrainedModel):
self.electra.config["hidden_act"])
self.init_weights()
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None, ):
r"""
labels (``paddle.Tensor`` dtype=ing64 of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids`
docstring) Indices should be in ``[0, 1]``:
- 0 indicates the token is an original token,
- 1 indicates the token was replaced.
"""
return_dict = return_dict if return_dict is not None else self.return_dict
discriminator_hidden_states = self.electra(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict, )
discriminator_sequence_output = discriminator_hidden_states[0]
def forward(self,
input_ids,
token_type_ids=None,
position_ids=None,
attention_mask=None):
discriminator_sequence_output = self.electra(
input_ids, token_type_ids, position_ids, attention_mask)
logits = self.discriminator_predictions(discriminator_sequence_output)
loss = None
if labels is not None:
loss_fct = nn.BCEWithLogitsLoss()
if attention_mask is not None:
active_loss = paddle.reshape(
attention_mask,
[-1, discriminator_sequence_output.shape[1]]) == 1
active_logits = paddle.reshape(
logits,
[-1, discriminator_sequence_output.shape[1]])[active_loss]
active_labels = labels[active_loss]
loss = loss_fct(active_logits, active_labels.astype("float32"))
else:
loss = loss_fct(
paddle.reshape(
logits, [-1, discriminator_sequence_output.shape[1]]),
labels.astype("float32"))
if not return_dict:
output = (logits, ) + discriminator_hidden_states[1:]
return ((loss, ) + output) if loss is not None else output
return ElectraModelOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions, )
class ElectraForMaskedLM(ElectraPretrainedModel):
return logits
class ElectraGenerator(ElectraPretrainedModel):
def __init__(self, electra):
super(ElectraForMaskedLM, self).__init__()
super(ElectraGenerator, self).__init__()
self.electra = electra
self.generator_predictions = ElectraGeneratorPredictions(
......@@ -753,74 +462,25 @@ class ElectraForMaskedLM(ElectraPretrainedModel):
def forward(self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
**kwargs):
r"""
labels (:obj:`paddle.Tensor` dtype = int64 of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
(masked), the loss is only computed for the tokens with labels in ``[0, ..., vocab_size]``
kwargs (:obj:`Dict[str, any]`, optional, defaults to `{}`):
Used to hide legacy arguments that have been deprecated.
"""
assert (kwargs == {}
), "Unexpected keyword arguments: {list(kwargs.keys())}."
return_dict = return_dict if return_dict is not None else self.return_dict
generator_hidden_states = self.electra(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict, )
generator_sequence_output = generator_hidden_states[0]
attention_mask=None):
generator_sequence_output = self.electra(input_ids, token_type_ids,
position_ids, attention_mask)
prediction_scores = self.generator_predictions(
generator_sequence_output)
if not self.tie_word_embeddings:
prediction_scores = self.generator_lm_head(prediction_scores)
else:
prediction_scores = F.linear(
prediction_scores = paddle.add(paddle.matmul(
prediction_scores,
self.get_input_embeddings().weight.transpose([1, 0]),
self.generator_lm_head_bias)
loss = None
# Masked language modeling softmax layer
if labels is not None:
loss_fct = nn.CrossEntropyLoss(
reduction='none') # -100 index = padding token
loss = loss_fct(
paddle.reshape(prediction_scores, [-1, self.vocab_size]),
paddle.reshape(labels, [-1]))
umask_positions = paddle.zeros_like(labels).astype("float32")
mask_positions = paddle.ones_like(labels).astype("float32")
mask_positions = paddle.where(labels == -100, umask_positions,
mask_positions)
loss = loss.sum() / mask_positions.sum()
if not return_dict:
output = (prediction_scores, ) + generator_hidden_states[1:]
return ((loss, ) + output) if loss is not None else output
return ElectraModelOutput(
loss=loss,
logits=prediction_scores,
hidden_states=generator_hidden_states.hidden_states,
attentions=generator_hidden_states.attentions, )
self.get_input_embeddings().weight,
transpose_y=True),
self.generator_lm_head_bias)
return prediction_scores
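# Descriptive note: in the tied-embedding branch above the LM head reuses the input embedding
# matrix E, i.e. prediction_scores = hidden_states @ E^T + bias, so the generator predicts
# over the vocabulary without a separate output projection.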
# class ElectraClassificationHead and ElectraForSequenceClassification for fine-tuning
......@@ -837,9 +497,7 @@ class ElectraClassificationHead(nn.Layer):
x = features[:, 0, :] # take <s> token (equiv. to [CLS])
x = self.dropout(x)
x = self.dense(x)
x = get_activation("gelu")(
x
) # although BERT uses tanh here, it seems Electra authors used gelu here
x = get_activation("gelu")(x) # Electra paper used gelu here
x = self.dropout(x)
x = self.out_proj(x)
return x
......@@ -856,50 +514,18 @@ class ElectraForSequenceClassification(ElectraPretrainedModel):
self.init_weights()
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None, ):
r"""
labels (:obj:`paddle.Tensor` dtype=int64 of shape :obj:`(batch_size,)`, `optional`):
Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
num_labels - 1]`. If :obj:`num_labels == 1` a regression loss is computed (Mean-Square loss),
If :obj:`num_labels > 1` a classification loss is computed (Cross-Entropy).
"""
return_dict = return_dict if return_dict is not None else self.return_dict
discriminator_hidden_states = self.electra(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict, )
sequence_output = discriminator_hidden_states[0]
logits = self.classifier(sequence_output)
def forward(self,
input_ids=None,
token_type_ids=None,
position_ids=None,
attention_mask=None):
sequence_output = self.electra(input_ids, token_type_ids, position_ids,
attention_mask)
loss = None
if not return_dict:
output = (logits, ) + discriminator_hidden_states[1:]
return ((loss, ) + output) if loss is not None else output
logits = self.classifier(sequence_output)
return ElectraModelOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions, )
return logits
class ElectraForTokenClassification(ElectraPretrainedModel):
......@@ -912,51 +538,19 @@ class ElectraForTokenClassification(ElectraPretrainedModel):
self.num_labels)
self.init_weights()
def forward(
self,
input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None, ):
r"""
labels (:obj:`paddle.Tensor` dtype=int64 of shape :obj:`(batch_size, sequence_length)`, `optional`):
Labels for computing the token classification loss.
Indices should be in ``[0, ..., num_labels-1]``.
"""
return_dict = return_dict if return_dict is not None else self.return_dict
discriminator_hidden_states = self.electra(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
output_attentions,
output_hidden_states,
return_dict, )
discriminator_sequence_output = discriminator_hidden_states[0]
discriminator_sequence_output = self.dropout(
discriminator_sequence_output)
logits = self.classifier(discriminator_sequence_output)
loss = None
if not return_dict:
output = (logits, ) + discriminator_hidden_states[1:]
return ((loss, ) + output) if loss is not None else output
return ElectraModelOutput(
loss=loss,
logits=logits,
hidden_states=discriminator_hidden_states.hidden_states,
attentions=discriminator_hidden_states.attentions, )
def forward(self,
input_ids=None,
token_type_ids=None,
position_ids=None,
attention_mask=None):
sequence_output = self.electra(input_ids, token_type_ids, position_ids,
attention_mask)
sequence_output = self.dropout(sequence_output)
logits = self.classifier(sequence_output)
return logits
class ElectraForTotalPretraining(ElectraPretrainedModel):
......@@ -980,27 +574,26 @@ class ElectraForTotalPretraining(ElectraPretrainedModel):
else:
return None
def get_discriminator_inputs(self, inputs, raw_inputs, mlm_logits, masked,
use_softmax_sample):
"""Sample from the generator to create corrupted input."""
def get_discriminator_inputs(self, inputs, raw_inputs, gen_logits,
gen_labels, use_softmax_sample):
"""Sample from the generator to create discriminator input."""
# get generator token result
sampled_tokens = (self.sample_from_softmax(mlm_logits,
sampled_tokens = (self.sample_from_softmax(gen_logits,
use_softmax_sample)).detach()
#sampled_tokens = self.sample_from_softmax(mlm_logits)
sampled_tokids = paddle.argmax(sampled_tokens, axis=-1)
# update token only at mask position
# masked : [B, L], L contains -100(unmasked) or token value(masked)
# gen_labels : [B, L], L contains -100(unmasked) or token value(masked)
# mask_positions : [B, L], L contains 0(unmasked) or 1(masked)
umask_positions = paddle.zeros_like(masked)
mask_positions = paddle.ones_like(masked)
mask_positions = paddle.where(masked == -100, umask_positions,
umask_positions = paddle.zeros_like(gen_labels)
mask_positions = paddle.ones_like(gen_labels)
mask_positions = paddle.where(gen_labels == -100, umask_positions,
mask_positions)
updated_input = self.scatter_update(inputs, sampled_tokids,
updated_inputs = self.update_inputs(inputs, sampled_tokids,
mask_positions)
# use inputs and updated_input to generate labels
# use inputs and updated_input to get discriminator labels
labels = mask_positions * (paddle.ones_like(inputs) - paddle.equal(
updated_input, raw_inputs).astype("int32"))
return updated_input, labels, sampled_tokids
updated_inputs, raw_inputs).astype("int32"))
return updated_inputs, labels, sampled_tokids
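# Toy illustration of get_discriminator_inputs (made-up token ids): with
#     raw_inputs = [11, 12, 13] and gen_labels = [-100, 12, -100]
# only position 1 was masked; if the generator samples token 99 there, updated_inputs becomes
# [11, 99, 13] and the discriminator labels are [0, 1, 0]; if it samples the original 12,
# the labels are [0, 0, 0].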
def sample_from_softmax(self, logits, use_softmax_sample=True):
if use_softmax_sample:
......@@ -1010,24 +603,12 @@ class ElectraForTotalPretraining(ElectraPretrainedModel):
else:
gumbel_noise = paddle.zeros_like(logits)
# softmax_sample is equal to sampled_tokids.unsqueeze(-1)
ins_softmax = nn.Softmax(axis=-1)
softmax_sample = paddle.argmax(
ins_softmax(logits + gumbel_noise), axis=-1)
F.softmax(logits + gumbel_noise), axis=-1)
# one hot
return F.one_hot(softmax_sample, logits.shape[-1])
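An illustrative NumPy re-implementation of the sampling step (an assumption, since the hunk above elides the noise construction): adding Gumbel noise to the logits before the argmax draws a token from the softmax distribution rather than always taking the most likely one; with zero noise it degenerates to plain argmax.

import numpy as np

def sample_from_softmax_np(logits, use_softmax_sample=True):
    if use_softmax_sample:
        # standard Gumbel-noise construction (assumed to match the elided lines)
        uniform = np.random.uniform(low=1e-9, high=1.0 - 1e-9, size=logits.shape)
        gumbel_noise = -np.log(-np.log(uniform))
    else:
        gumbel_noise = np.zeros_like(logits)
    sampled_ids = np.argmax(logits + gumbel_noise, axis=-1)
    return np.eye(logits.shape[-1])[sampled_ids]   # one-hot, like F.one_hot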
def scatter_update(self, sequence, updates, positions):
"""Scatter-update a sequence.
Args:
sequence: A [batch_size, seq_len] or [batch_size, seq_len, depth] tensor
updates: A tensor of size batch_size*seq_len(*depth)
positions: A [batch_size, n_positions] tensor
Returns: A tuple of two tensors. First is a [batch_size, seq_len] or
[batch_size, seq_len, depth] tensor of "sequence" with elements at
"positions" replaced by the values at "updates." Updates to index 0 are
ignored. If there are duplicated positions the update is only applied once.
Second is a [batch_size, seq_len] mask tensor of which inputs were updated.
"""
def update_inputs(self, sequence, updates, positions):
shape = sequence.shape
assert (len(shape) == 2), "the dimension of inputs should be [B, L]"
B, L = shape
......@@ -1041,68 +622,59 @@ class ElectraForTotalPretraining(ElectraPretrainedModel):
return updated_sequence
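Since the body of update_inputs is elided above, a plain-Python sketch of the intended masked update (hypothetical values) may help:

sequence  = [101, 7592, 2088, 2003, 102]   # current input token ids
updates   = [  0, 7592,    0, 2193,   0]   # sampled token ids (toy values)
positions = [  0,    1,    0,    1,   0]   # 1 = masked slot to overwrite
updated_sequence = [u if p else s
                    for s, u, p in zip(sequence, updates, positions)]
# -> [101, 7592, 2088, 2193, 102]: masked slots take the generator's samples,
#    every other position keeps the original token.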
def forward(
self,
input_ids=None,
raw_input_ids=None,
attention_mask=None,
token_type_ids=None,
position_ids=None,
head_mask=None,
inputs_embeds=None,
labels=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None, ):
r"""
labels (``paddle.Tensor`` dtype=int64 of shape ``(batch_size, sequence_length)``, `optional`):
Labels for computing the ELECTRA loss. Input should be a sequence of tokens (see :obj:`input_ids`
docstring) Indices should be in ``[0, 1]``:
- 0 indicates the token is an original token,
- 1 indicates the token was replaced.
Returns:
"""
return_dict = return_dict if return_dict is not None else self.return_dict
def forward(self,
input_ids=None,
token_type_ids=None,
position_ids=None,
attention_mask=None,
raw_input_ids=None,
gen_labels=None):
assert (
labels is not None
), "labels should not be None, please check DataCollatorForLanguageModeling"
generator_output = self.generator(
input_ids,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
labels,
output_attentions,
output_hidden_states,
return_dict, )
loss = generator_output[0] * self.gen_weight
logits = generator_output[1]
discriminator_inputs, discriminator_labels, generator_predict_tokens = self.get_discriminator_inputs(
input_ids, raw_input_ids, logits, labels, self.use_softmax_sample)
discriminator_output = self.discriminator(
discriminator_inputs,
attention_mask,
token_type_ids,
position_ids,
head_mask,
inputs_embeds,
discriminator_labels,
output_attentions,
output_hidden_states,
return_dict, )
loss += discriminator_output[0] * self.disc_weight
logits = discriminator_output[1]
if not return_dict:
return ((loss, ) + (logits, ))
return ElectraModelOutput(
loss=loss,
logits=logits,
hidden_states=generator_output.hidden_states,
attentions=generator_output.attentions, )
gen_labels is not None
), "gen_labels should not be None, please check DataCollatorForLanguageModeling"
gen_logits = self.generator(input_ids, token_type_ids, position_ids,
attention_mask)
disc_inputs, disc_labels, generator_predict_tokens = self.get_discriminator_inputs(
input_ids, raw_input_ids, gen_logits, gen_labels,
self.use_softmax_sample)
disc_logits = self.discriminator(disc_inputs, token_type_ids,
position_ids, attention_mask)
return gen_logits, disc_logits, disc_labels
class ElectraPretrainingCriterion(paddle.nn.Layer):
def __init__(self, vocab_size, gen_weight, disc_weight):
super(ElectraPretrainingCriterion, self).__init__()
self.vocab_size = vocab_size
self.gen_weight = gen_weight
self.disc_weight = disc_weight
self.gen_loss_fct = nn.CrossEntropyLoss(reduction='none')
self.disc_loss_fct = nn.BCEWithLogitsLoss()
def forward(self, generator_prediction_scores,
discriminator_prediction_scores, generator_labels,
discriminator_labels):
# generator loss
gen_loss = self.gen_loss_fct(
paddle.reshape(generator_prediction_scores, [-1, self.vocab_size]),
paddle.reshape(generator_labels, [-1]))
# TODO: the 4 lines below can be removed once CrossEntropyLoss(reduction='mean') properly ignores -100 labels when averaging
umask_positions = paddle.zeros_like(generator_labels).astype("float32")
mask_positions = paddle.ones_like(generator_labels).astype("float32")
mask_positions = paddle.where(generator_labels == -100, umask_positions,
mask_positions)
gen_loss = gen_loss.sum() / mask_positions.sum()
# discriminator loss
seq_length = discriminator_labels.shape[1]
disc_loss = self.disc_loss_fct(
paddle.reshape(discriminator_prediction_scores, [-1, seq_length]),
discriminator_labels.astype("float32"))
return self.gen_weight * gen_loss + self.disc_weight * disc_loss
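A minimal, self-contained sketch of how the three outputs of ElectraForTotalPretraining.forward plug into this criterion, using synthetic tensors and a tiny vocabulary (run it where the classes above are importable; the gen/disc weights of 1.0 and 50.0 follow the ELECTRA paper defaults, not this hunk):

import paddle

vocab_size, batch_size, seq_len = 8, 2, 5
criterion = ElectraPretrainingCriterion(vocab_size, gen_weight=1.0, disc_weight=50.0)

gen_logits = paddle.randn([batch_size, seq_len, vocab_size])   # generator MLM scores
disc_logits = paddle.randn([batch_size, seq_len])              # one replaced-token score per token
gen_labels = paddle.to_tensor([[-100, 3, -100, 5, -100],
                               [-100, -100, 2, -100, 7]], dtype="int64")
disc_labels = paddle.to_tensor([[0, 1, 0, 0, 0],
                                [0, 0, 1, 0, 0]], dtype="int64")
loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels)
# loss is a scalar: gen_weight * masked-LM loss + disc_weight * detection loss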
......@@ -47,47 +47,32 @@ class ElectraTokenizer(PretrainedTokenizer):
resource_files_names = {"vocab_file": "vocab.txt"} # for save_pretrained
pretrained_resource_files_map = {
"vocab_file": {
"electra-small-generator":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-small-generator/vocab.txt",
"electra-base-generator":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-base-generator/vocab.txt",
"electra-large-generator":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-large-generator/vocab.txt",
"electra-small-discriminator":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-small-discriminator/vocab.txt",
"electra-base-discriminator":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-base-discriminator/vocab.txt",
"electra-large-discriminator":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-large-discriminator/vocab.txt",
"chinese-electra-discriminator-base":
"http://paddlenlp.bj.bcebos.com/models/transformers/chinese-electra-discriminator-base/vocab.txt",
"chinese-electra-discriminator-small":
"http://paddlenlp.bj.bcebos.com/models/transformers/chinese-electra-discriminator-small/vocab.txt",
"electra-small":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-small-vocab.txt",
"electra-base":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-base-vocab.txt",
"electra-large":
"https://paddlenlp.bj.bcebos.com/models/transformers/electra-large-vocab.txt",
"chinese-electra-base":
"http://paddlenlp.bj.bcebos.com/models/transformers/chinese-electra-base/vocab.txt",
"chinese-electra-small":
"http://paddlenlp.bj.bcebos.com/models/transformers/chinese-electra-small/vocab.txt",
}
}
pretrained_init_configuration = {
"electra-small-generator": {
"electra-small": {
"do_lower_case": True
},
"electra-base-generator": {
"electra-base": {
"do_lower_case": True
},
"electra-large-generator": {
"electra-large": {
"do_lower_case": True
},
"electra-small-discriminator": {
"chinese-electra-base": {
"do_lower_case": True
},
"electra-base-discriminator": {
"do_lower_case": True
},
"electra-large-discriminator": {
"do_lower_case": True
},
"chinese-electra-discriminator-base": {
"do_lower_case": True
},
"chinese-electra-discriminator-small": {
"chinese-electra-small": {
"do_lower_case": True
}
}
......@@ -163,15 +148,12 @@ class ElectraTokenizer(PretrainedTokenizer):
def num_special_tokens_to_add(self, pair=False):
"""
Returns the number of added tokens when encoding a sequence with special tokens.
Note:
This encodes inputs and checks the number of added tokens, and is therefore not efficient. Do not put this
inside your training loop.
Args:
pair: Returns the number of added tokens in the case of a sequence pair if set to True, returns the
number of added tokens in the case of a single sequence if set to False.
Returns:
Number of tokens added to sequences
"""
......@@ -190,13 +172,11 @@ class ElectraTokenizer(PretrainedTokenizer):
::
- single sequence: ``[CLS] X [SEP]``
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of input_id with the appropriate special tokens.
"""
......@@ -211,21 +191,16 @@ class ElectraTokenizer(PretrainedTokenizer):
token_ids_1=None):
"""
Create a mask from the two sequences passed to be used in a sequence-pair classification task.
A BERT sequence pair mask has the following format:
::
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
| first sequence | second sequence |
If :obj:`token_ids_1` is :obj:`None`, this method only returns the first portion of the mask (0s).
Args:
token_ids_0 (:obj:`List[int]`):
List of IDs.
token_ids_1 (:obj:`List[int]`, `optional`):
Optional second list of IDs for sequence pairs.
Returns:
:obj:`List[int]`: List of token_type_id according to the given sequence(s).
"""
......@@ -251,7 +226,6 @@ class ElectraTokenizer(PretrainedTokenizer):
"""
Returns a dictionary containing the encoded sequence or sequence pair and additional information:
the mask for sequence classification and the overflowing elements if a ``max_seq_len`` is specified.
Args:
text (:obj:`str`, :obj:`List[str]` or :obj:`List[int]`):
The first sequence to be encoded. This can be a string, a list of strings (tokenized string using
......@@ -270,7 +244,6 @@ class ElectraTokenizer(PretrainedTokenizer):
model's max length.
truncation_strategy (:obj:`str`, `optional`, defaults to `longest_first`):
String selected in the following options:
- 'longest_first' (default): iteratively reduce the inputs until the total length is under max_seq_len,
removing one token at a time from the longer sequence (when there is a pair of input sequences)
- 'only_first': Only truncate the first sequence
......@@ -288,10 +261,8 @@ class ElectraTokenizer(PretrainedTokenizer):
Set to True to return overflowing token information (default False).
return_special_tokens_mask (:obj:`bool`, `optional`, defaults to :obj:`False`):
Set to True to return special tokens mask information (default False).
Return:
A Dictionary of shape::
{
input_ids: list[int],
position_ids: list[int] if return_position_ids is True (default)
......@@ -302,9 +273,7 @@ class ElectraTokenizer(PretrainedTokenizer):
num_truncated_tokens: int if a ``max_seq_len`` is specified and return_overflowing_tokens is True
special_tokens_mask: list[int] if return_special_tokens_mask is True
}
With the fields:
- ``input_ids``: list of token ids to be fed to a model
- ``position_ids``: list of token position ids to be fed to a model
- ``segment_ids``: list of token type ids to be fed to a model
......
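A hedged usage sketch of the renamed tokenizer identifiers in this diff (e.g. "electra-small"): encode a sentence pair and inspect the fields listed in the docstring above; the field names follow that docstring rather than a verified run.

from paddlenlp.transformers import ElectraTokenizer

tokenizer = ElectraTokenizer.from_pretrained("electra-small")
encoded = tokenizer.encode("ELECTRA replaces tokens.", "The discriminator spots them.")
print(encoded["input_ids"])    # [CLS] A [SEP] B [SEP] as token ids
print(encoded["segment_ids"])  # 0s over the first segment, 1s over the second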