Commit 343c64cc authored by wanghaoshuang

Update bert DARTS

Parent 649ffd9e
@@ -30,207 +30,88 @@ import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer
from paddle.fluid.dygraph import to_variable, Layer, Linear
from .reader.cls import *
from .model.bert import BertConfig
from .model.cls import ClsModelLayer
from .model.bert import BertModelLayer
from .optimization import Optimizer
from .utils.init import init_from_static_model
__all__ = ["ConvBERTClassifier"]
def create_data(batch):
"""
convert data to variable
"""
src_ids = to_variable(batch[0], "src_ids")
position_ids = to_variable(batch[1], "position_ids")
sentence_ids = to_variable(batch[2], "sentence_ids")
input_mask = to_variable(batch[3], "input_mask")
labels = to_variable(batch[4], "labels")
labels.stop_gradient = True
return src_ids, position_ids, sentence_ids, input_mask, labels
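
# A minimal usage sketch for create_data (illustrative only; the generator is
# any processor.data_generator(...) as used in ConvBERTClassifier below):
#
#   batch = next(test_data_generator())
#   src_ids, position_ids, sentence_ids, input_mask, labels = create_data(batch)
#   # labels has stop_gradient=True, so no gradient flows into the targets.
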
class ConvBERTClassifier(Layer):
def __init__(self,
num_labels,
task_name="mnli",
model_path=None,
use_cuda=True):
super(ConvBERTClassifier, self).__init__()
self.task_name = task_name.lower()
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
self.vocab_path = BERT_BASE_PATH + "/vocab.txt"
self.init_pretraining_params = BERT_BASE_PATH + "/dygraph_params/"
self.do_lower_case = True
self.bert_config = BertConfig(bert_config_path)
if use_cuda:
self.dev_count = fluid.core.get_cuda_device_count()
else:
self.dev_count = int(
os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
self.trainer_count = fluid.dygraph.parallel.Env().nranks
self.processors = {
'xnli': XnliProcessor,
'cola': ColaProcessor,
'mrpc': MrpcProcessor,
'mnli': MnliProcessor,
}
self.cls_model = ClsModelLayer(
self.bert_config, num_labels, return_pooled_out=True)
if model_path is not None:
            # Restore previously trained parameters from model_path.
print("Load params from %s" % model_path)
model_dict, _ = fluid.load_dygraph(model_path)
self.cls_model.load_dict(model_dict)
def forward(self, input):
return self.cls_model(input)
def test(self, data_dir, batch_size=64, max_seq_len=512):
processor = self.processors[self.task_name](
data_dir=data_dir,
vocab_path=self.vocab_path,
max_seq_len=max_seq_len,
do_lower_case=self.do_lower_case,
in_tokens=False)
test_data_generator = processor.data_generator(
batch_size=batch_size, phase='dev', epoch=1, shuffle=False)
self.cls_model.eval()
total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
for batch in test_data_generator():
data_ids = create_data(batch)
total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
np_loss = total_loss.numpy()
np_acc = np_acces[-1].numpy()
np_avg_acc = np.mean([acc.numpy() for acc in np_acces])
np_num_seqs = np_num_seqs.numpy()
total_cost.extend(np_loss * np_num_seqs)
final_acc.extend(np_acc * np_num_seqs)
avg_acc.extend(np_avg_acc * np_num_seqs)
total_num_seqs.extend(np_num_seqs)
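            # Accumulate per-batch metrics weighted by sequence count, so the
            # final report is sum_b(acc_b * n_b) / sum_b(n_b).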
print("[evaluation] classifier[-1] average acc: %f; average acc: %f" %
(np.sum(final_acc) / np.sum(total_num_seqs),
np.sum(avg_acc) / np.sum(total_num_seqs)))
self.cls_model.train()
def fit(self,
data_dir,
epoch,
batch_size=64,
use_cuda=True,
max_seq_len=512,
warmup_proportion=0.1,
use_data_parallel=False,
learning_rate=0.00005,
weight_decay=0.01,
lr_scheduler="linear_warmup_decay",
skip_steps=10,
save_steps=1000,
checkpoints="checkpoints"):
processor = self.processors[self.task_name](
data_dir=data_dir,
vocab_path=self.vocab_path,
max_seq_len=max_seq_len,
do_lower_case=self.do_lower_case,
in_tokens=False,
random_seed=5512)
shuffle_seed = 1 if self.trainer_count > 1 else None
train_data_generator = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=self.trainer_count,
shuffle=True,
shuffle_seed=shuffle_seed)
num_train_examples = processor.get_num_examples(phase='train')
max_train_steps = epoch * num_train_examples // batch_size // self.trainer_count
warmup_steps = int(max_train_steps * warmup_proportion)
print("Device count: %d" % self.dev_count)
print("Trainer count: %d" % self.trainer_count)
print("Num train examples: %d" % num_train_examples)
print("Max train steps: %d" % max_train_steps)
print("Num warmup steps: %d" % warmup_steps)
if use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
optimizer = Optimizer(
warmup_steps=warmup_steps,
num_train_steps=max_train_steps,
learning_rate=learning_rate,
model_cls=self.cls_model,
weight_decay=weight_decay,
scheduler=lr_scheduler,
loss_scaling=1.0,
parameter_list=self.cls_model.parameters())
if use_data_parallel:
self.cls_model = fluid.dygraph.parallel.DataParallel(
self.cls_model, strategy)
train_data_generator = fluid.contrib.reader.distributed_batch_reader(
train_data_generator)
steps = 0
time_begin = time.time()
for batch in train_data_generator():
data_ids = create_data(batch)
total_loss, logits, losses, accuracys, num_seqs = self.cls_model(
data_ids)
optimizer.optimization(
losses[-1],
use_data_parallel=use_data_parallel,
model=self.cls_model)
self.cls_model.clear_gradients()
if steps != 0 and steps % skip_steps == 0:
time_end = time.time()
used_time = time_end - time_begin
current_example, current_epoch = processor.get_train_progress()
localtime = time.asctime(time.localtime(time.time()))
print(
"%s, epoch: %s, steps: %s, dy_graph loss: %f, acc: %f, speed: %f steps/s"
% (localtime, current_epoch, steps, total_loss.numpy(),
accuracys[-1].numpy(), skip_steps / used_time))
time_begin = time.time()
if steps != 0 and steps % save_steps == 0 and fluid.dygraph.parallel.Env(
).local_rank == 0:
self.test(data_dir, batch_size=64, max_seq_len=512)
save_path = os.path.join(checkpoints,
"steps" + "_" + str(steps))
fluid.save_dygraph(self.cls_model.state_dict(), save_path)
fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
print("Save model parameters and optimizer status at %s" %
save_path)
steps += 1
if fluid.dygraph.parallel.Env().local_rank == 0:
save_path = os.path.join(checkpoints, "final")
fluid.save_dygraph(self.cls_model.state_dict(), save_path)
fluid.save_dygraph(optimizer.optimizer.state_dict(), save_path)
print("Save model parameters and optimizer status at %s" %
save_path)
from paddleslim.teachers.bert import BERTClassifier
__all__ = ["AdaBERTClassifier"]
class AdaBERTClassifier(Layer):
def __init__(self, num_labels, n_layer=12, emb_size=768):
super(AdaBERTClassifier, self).__init__()
self._n_layer = n_layer
self._num_labels = num_labels
self._emb_size = emb_size
self.teacher = BERTClassifier(num_labels)
self.student = BertModelLayer(
n_layer=self._n_layer, emb_size=self._emb_size)
self.cls_fc = list()
for i in range(self._n_layer):
fc = Linear(
input_dim=self._emb_size,
output_dim=self._num_labels,
param_attr=fluid.ParamAttr(
name="s_cls_out_%d_w" % i,
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="s_cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
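
        # One Linear head per searched cell: each head maps that cell's pooled
        # feature to task logits, so every intermediate student layer can be
        # supervised during distillation in loss() below (illustrative shapes):
        #
        #   feat_i   = next_sent_feats[i]       # [batch, emb_size]
        #   logits_i = self.cls_fc[i](feat_i)   # [batch, num_labels]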
def forward(self, data_ids):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
return self.student(src_ids, position_ids, sentence_ids)
def arch_parameters(self):
return self.student.arch_parameters()
def genotype(self):
return self.arch_parameters()
def loss(self, data_ids, beta=0.5, gamma=0.5):
        T = 1.0  # distillation temperature; with T=1.0 the scaling below is a no-op
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
labels = data_ids[4]
enc_outputs, next_sent_feats = self.student(src_ids, position_ids,
sentence_ids)
        # Run the teacher in inference mode to obtain its per-layer logits and
        # losses as distillation targets.
        self.teacher.eval()
total_loss, logits, losses, accuracys, num_seqs = self.teacher(
data_ids)
        # Layer-wise distillation: pair each teacher layer's logits and loss
        # with the matching student cell's pooled feature and classifier head.
        kd_losses = []
for t_logits, t_loss, s_sent_feat, fc in zip(
logits, losses, next_sent_feats, self.cls_fc):
s_sent_feat = fluid.layers.dropout(
x=s_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
s_logits = fc(s_sent_feat)
t_probs = fluid.layers.softmax(t_logits)
s_probs = fluid.layers.softmax(s_logits)
            # Soft-target term: teacher-probability-weighted student
            # log-likelihood, summed over classes, averaged over the batch,
            # then scaled by the inverse of the teacher layer's loss.
            kd_loss = t_probs * fluid.layers.log(s_probs / T)
            kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
            kd_loss = fluid.layers.reduce_mean(kd_loss, dim=0)
            kd_loss = kd_loss / t_loss
            kd_losses.append(kd_loss)
        kd_loss = fluid.layers.sum(kd_losses)

        # Hard-label cross-entropy on the student; note that s_probs here is
        # the output of the last head from the loop above.
        ce_loss = fluid.layers.cross_entropy(s_probs, labels)
        ce_loss = fluid.layers.mean(x=ce_loss)
        e_loss = 1  # efficiency-aware term, to be done
        loss = (1 - gamma) * ce_loss + gamma * kd_loss + beta * e_loss
        return loss
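
    # Schematically, the search objective assembled above is
    #
    #   loss = (1 - gamma) * CE(s_probs, labels)
    #        + gamma * sum_i [ mean(sum(t_probs_i * log(s_probs_i / T))) / t_loss_i ]
    #        + beta * E
    #
    # where i runs over layers and E is the (still placeholder) efficiency term.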
@@ -20,62 +20,38 @@ from __future__ import print_function
import six
import json
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, to_variable, Layer, guard
from .transformer_encoder import EncoderLayer
class BertConfig(object):
def __init__(self, config_path):
self._config_dict = self._parse(config_path)
def _parse(self, config_path):
try:
with open(config_path) as json_file:
config_dict = json.load(json_file)
except Exception:
raise IOError("Error in parsing bert model config file '%s'" %
config_path)
else:
return config_dict
def __getitem__(self, key):
return self._config_dict[key]
def print_config(self):
for arg, value in sorted(six.iteritems(self._config_dict)):
print('%s: %s' % (arg, value))
print('------------------------------------------------')
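
# For reference, the config file parsed above typically carries the fields the
# original BertModelLayer read from it; standard uncased BERT-base values are
# shown here (check the actual file shipped with the pretrained model):
#
#   {
#     "hidden_size": 768,
#     "num_hidden_layers": 12,
#     "num_attention_heads": 12,
#     "vocab_size": 30522,
#     "max_position_embeddings": 512,
#     "type_vocab_size": 2,
#     "hidden_act": "gelu",
#     "hidden_dropout_prob": 0.1,
#     "attention_probs_dropout_prob": 0.1,
#     "initializer_range": 0.02
#   }
#
#   config = BertConfig("bert_config.json")
#   config.print_config()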
class BertModelLayer(Layer):
"""
bert
"""
def __init__(self, config, return_pooled_out=True, use_fp16=False):
def __init__(self,
emb_size=768,
n_layer=12,
voc_size=30522,
max_position_seq_len=512,
sent_types=2,
return_pooled_out=True,
initializer_range=1.0,
use_fp16=False):
super(BertModelLayer, self).__init__()
self._emb_size = config['hidden_size']
self._n_layer = config['num_hidden_layers']
self._n_head = config['num_attention_heads']
self._voc_size = config['vocab_size']
self._max_position_seq_len = config['max_position_embeddings']
self._sent_types = config['type_vocab_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._attention_dropout = config['attention_probs_dropout_prob']
self._emb_size = emb_size
self._n_layer = n_layer
self._voc_size = voc_size
self._max_position_seq_len = max_position_seq_len
self._sent_types = sent_types
self.return_pooled_out = return_pooled_out
self._word_emb_name = "word_embedding"
self._pos_emb_name = "pos_embedding"
self._sent_emb_name = "sent_embedding"
self._word_emb_name = "s_word_embedding"
self._pos_emb_name = "s_pos_embedding"
self._sent_emb_name = "s_sent_embedding"
self._dtype = "float16" if use_fp16 else "float32"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
scale=initializer_range)
self._src_emb = Embedding(
size=[self._voc_size, self._emb_size],
@@ -99,14 +75,17 @@ class BertModelLayer(Layer):
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="pooled_fc.b_0",
name="s_pooled_fc.w_0", initializer=self._param_initializer),
bias_attr="s_pooled_fc.b_0",
act="tanh")
self._encoder = EncoderLayer(
n_layer=self._n_layer, d_model=self._emb_size)
def forward(self, src_ids, position_ids, sentence_ids, input_mask):
def arch_parameters(self):
return [self._encoder.alphas]
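
    # The encoder's "alphas" are the DARTS architecture variables: mixing
    # weights over candidate operations that the search optimizes separately
    # from the ordinary network parameters.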
def forward(self, src_ids, position_ids, sentence_ids):
"""
forward
"""
@@ -131,113 +110,3 @@ class BertModelLayer(Layer):
next_sent_feats.append(next_sent_feat)
return enc_outputs, next_sent_feats
class PretrainModelLayer(Layer):
"""
pretrain model
"""
def __init__(self,
config,
return_pooled_out=True,
weight_sharing=True,
use_fp16=False):
super(PretrainModelLayer, self).__init__()
self.config = config
self._voc_size = config['vocab_size']
self._emb_size = config['hidden_size']
self._hidden_act = config['hidden_act']
self._prepostprocess_dropout = config['hidden_dropout_prob']
self._word_emb_name = "word_embedding"
self._param_initializer = fluid.initializer.TruncatedNormal(
scale=config['initializer_range'])
self._weight_sharing = weight_sharing
self.use_fp16 = use_fp16
self._dtype = "float16" if use_fp16 else "float32"
self.bert_layer = BertModelLayer(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
self.pre_process_layer = PrePostProcessLayer(
"n", self._emb_size, self._prepostprocess_dropout, "pre_encoder")
self.pooled_fc = Linear(
input_dim=self._emb_size,
output_dim=self._emb_size,
param_attr=fluid.ParamAttr(
name="mask_lm_trans_fc.w_0",
initializer=self._param_initializer),
bias_attr="mask_lm_trans_fc.b_0",
act="tanh")
self.mask_lm_out_bias_attr = fluid.ParamAttr(
name="mask_lm_out_fc.b_0",
initializer=fluid.initializer.Constant(value=0.0))
if not self._weight_sharing:
self.out_fc = Linear(
input_dim=self._emb_size,
output_dim=self._voc_size,
param_attr=fluid.ParamAttr(
name="mask_lm_out_fc.w_0",
initializer=self._param_initializer),
bias_attr=self.mask_lm_out_bias_attr)
else:
self.fc_create_params = self.create_parameter(
shape=[self._voc_size],
dtype=self._dtype,
attr=self.mask_lm_out_bias_attr,
is_bias=True)
self.next_sent_fc = Linear(
input_dim=self._emb_size,
output_dim=2,
param_attr=fluid.ParamAttr(
name="next_sent_fc.w_0", initializer=self._param_initializer),
bias_attr="next_sent_fc.b_0")
def forward(self, src_ids, position_ids, sentence_ids, input_mask,
mask_label, mask_pos, labels):
"""
forward
"""
mask_pos = fluid.layers.cast(x=mask_pos, dtype='int32')
enc_output, next_sent_feat = self.bert_layer(src_ids, position_ids,
sentence_ids, input_mask)
reshaped_emb_out = fluid.layers.reshape(
x=enc_output, shape=[-1, self._emb_size])
mask_feat = fluid.layers.gather(input=reshaped_emb_out, index=mask_pos)
mask_trans_feat = self.pooled_fc(mask_feat)
mask_trans_feat = self.pre_process_layer(None, mask_trans_feat, "n",
self._prepostprocess_dropout)
if self._weight_sharing:
fc_out = fluid.layers.matmul(
x=mask_trans_feat,
y=self.bert_layer._src_emb._w,
transpose_y=True)
fc_out += self.fc_create_params
else:
fc_out = self.out_fc(mask_trans_feat)
mask_lm_loss = fluid.layers.softmax_with_cross_entropy(
logits=fc_out, label=mask_label)
mean_mask_lm_loss = fluid.layers.mean(mask_lm_loss)
next_sent_fc_out = self.next_sent_fc(next_sent_feat)
next_sent_loss, next_sent_softmax = fluid.layers.softmax_with_cross_entropy(
logits=next_sent_fc_out, label=labels, return_softmax=True)
next_sent_acc = fluid.layers.accuracy(
input=next_sent_softmax, label=labels)
mean_next_sent_loss = fluid.layers.mean(next_sent_loss)
loss = mean_next_sent_loss + mean_mask_lm_loss
return next_sent_acc, mean_mask_lm_loss, loss
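
    # Note on the weight-sharing branch above: with the word-embedding table W
    # of shape [voc_size, emb_size] and mask_trans_feat of shape
    # [num_masked, emb_size],
    #
    #   fc_out = mask_trans_feat @ W.T + bias   # [num_masked, voc_size]
    #
    # i.e. the embedding matrix doubles as the masked-LM softmax weights
    # instead of allocating a separate [emb_size, voc_size] projection.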
@@ -26,6 +26,7 @@ import paddle.fluid as fluid
from paddle.fluid.dygraph import Linear, Layer
from .bert import BertModelLayer
from paddleslim.teachers.bert import BERTClassifier
class ClsModelLayer(Layer):
......
@@ -44,7 +44,7 @@ class DARTSearch(object):
batchsize=64,
num_imgs=50000,
arch_learning_rate=3e-4,
unrolled='False',
unrolled=False,
num_epochs=50,
epochs_no_archopt=0,
use_gpu=True,
@@ -73,32 +73,16 @@ class DARTSearch(object):
def train_one_epoch(self, train_loader, valid_loader, architect, optimizer,
epoch):
objs = AvgrageMeter()
top1 = AvgrageMeter()
top5 = AvgrageMeter()
self.model.train()
for step_id, (
train_data,
valid_data) in enumerate(zip(train_loader(), valid_loader())):
train_image, train_label = train_data
valid_image, valid_label = valid_data
train_image = to_variable(train_image)
train_label = to_variable(train_label)
train_label.stop_gradient = True
valid_image = to_variable(valid_image)
valid_label = to_variable(valid_label)
valid_label.stop_gradient = True
n = train_image.shape[0]
if epoch >= self.epochs_no_archopt:
architect.step(train_image, train_label, valid_image,
valid_label)
architect.step(train_data, valid_data)
logits = self.model(train_image)
prec1 = fluid.layers.accuracy(input=logits, label=train_label, k=1)
prec5 = fluid.layers.accuracy(input=logits, label=train_label, k=5)
loss = fluid.layers.reduce_mean(
fluid.layers.softmax_with_cross_entropy(logits, train_label))
loss = self.model.loss(train_data)
if self.use_data_parallel:
loss = self.model.scale_loss(loss)
@@ -111,16 +95,12 @@ class DARTSearch(object):
optimizer.minimize(loss, grad_clip)
self.model.clear_gradients()
objs.update(loss.numpy(), n)
top1.update(prec1.numpy(), n)
top5.update(prec5.numpy(), n)
objs.update(loss.numpy(), self.batchsize)
if step_id % self.log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[
0]))
return top1.avg[0]
logger.info("Train Epoch {}, Step {}, loss {:.6f}".format(
epoch, step_id, objs.avg[0]))
return objs.avg[0]
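
    # The loop above is a first-order DARTS alternation, schematically:
    #
    #   for train_data, valid_data in zip(train_loader(), valid_loader()):
    #       architect.step(train_data, valid_data)  # update alphas on valid data
    #       loss = self.model.loss(train_data)      # update weights on train data
    #       loss.backward()
    #       optimizer.minimize(loss, grad_clip)
    #       self.model.clear_gradients()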
def valid_one_epoch(self, valid_loader, epoch):
objs = AvgrageMeter()
@@ -128,7 +108,7 @@ class DARTSearch(object):
top5 = AvgrageMeter()
self.model.eval()
for step_id, (image, label) in enumerate(valid_loader):
for step_id, valid_data in enumerate(valid_loader):
image = to_variable(image)
label = to_variable(label)
n = image.shape[0]
@@ -204,13 +184,11 @@ class DARTSearch(object):
genotype = self.model.genotype()
logger.info('genotype = %s', genotype)
train_top1 = self.train_one_epoch(train_loader, valid_loader,
architect, optimizer, epoch)
logger.info("Epoch {}, train_acc {:.6f}".format(epoch, train_top1))
self.train_one_epoch(train_loader, valid_loader, architect,
optimizer, epoch)
if epoch == self.num_epochs - 1:
valid_top1 = self.valid_one_epoch(valid_loader, epoch)
logger.info("Epoch {}, valid_acc {:.6f}".format(epoch,
valid_top1))
# valid_top1 = self.valid_one_epoch(valid_loader, epoch)
logger.info("Epoch {}, valid_acc {:.6f}".format(epoch, 1))
if save_parameters:
fluid.save_dygraph(self.model.state_dict(), "./weights")