Commit 5ed02755 authored by W wanghaoshuang

Normalize the efficiency loss and the KD loss.

Parent 51044022
......@@ -14,7 +14,7 @@ def main():
max_seq_len = 512
do_lower_case = True
batch_size = 32
epoch = 3
epoch = 30
processor = MnliProcessor(
data_dir=data_dir,
......@@ -23,8 +23,6 @@ def main():
do_lower_case=do_lower_case,
in_tokens=False)
valid_reader = processor.data_generator(
batch_size=batch_size, phase='dev', epoch=epoch, shuffle=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase='train',
......@@ -32,13 +30,22 @@ def main():
dev_count=1,
shuffle=True)
val_reader = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=1,
shuffle=True)
with fluid.dygraph.guard(place):
model = AdaBERTClassifier(3)
model = AdaBERTClassifier(
3,
teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000"
)
searcher = DARTSearch(
model,
train_reader,
valid_reader,
learning_rate=0.001,
val_reader,
batchsize=batch_size,
num_epochs=epoch,
log_freq=10)
......
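The demo now searches for 30 epochs, builds the second reader (val_reader) from the same 'train' split, and hands a teacher checkpoint path to AdaBERTClassifier instead of fixing a learning rate on DARTSearch. A DARTS-style search consumes two batch streams per step, one for the network weights and one for the architecture parameters; below is a minimal sketch of that pattern with placeholder data, not the MnliProcessor API.

```python
# Minimal sketch of the two-reader pattern DARTSearch consumes
# (placeholder data; not the MnliProcessor API).
def make_reader(samples):
    def reader():
        for s in samples:
            yield s
    return reader

train_reader = make_reader(range(0, 10, 2))   # drives the weight updates
val_reader = make_reader(range(1, 10, 2))     # drives the architecture updates

for step_id, (train_batch, val_batch) in enumerate(zip(train_reader(), val_reader())):
    print(step_id, train_batch, val_batch)
```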
......@@ -41,12 +41,20 @@ __all__ = ["AdaBERTClassifier"]
class AdaBERTClassifier(Layer):
def __init__(self, num_labels, n_layer=12, emb_size=768):
def __init__(self, num_labels, n_layer=8, emb_size=768,
teacher_model=None):
super(AdaBERTClassifier, self).__init__()
self._n_layer = n_layer
self._num_labels = num_labels
self._emb_size = emb_size
self.teacher = BERTClassifier(num_labels)
print(
"----------------------load teacher model and test----------------------------------------"
)
self.teacher = BERTClassifier(num_labels, model_path=teacher_model)
# self.teacher.test("/work/PaddleSlim/demo/bert/data/glue_data/MNLI/")
print(
"----------------------finish load teacher model and test----------------------------------------"
)
self.student = BertModelLayer(
n_layer=self._n_layer, emb_size=self._emb_size)
......@@ -76,7 +84,7 @@ class AdaBERTClassifier(Layer):
def genotype(self):
return self.arch_parameters()
def loss(self, data_ids, beta=0.5, gamma=0.5):
def loss(self, data_ids, beta=4, gamma=0.8):
T = 1.0
src_ids = data_ids[0]
position_ids = data_ids[1]
......@@ -98,10 +106,20 @@ class AdaBERTClassifier(Layer):
# define kd loss
kd_losses = []
kd_weights = []
for i in range(len(next_sent_feats)):
j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
kd_weights.append(t_losses[j].numpy())
kd_weights = 1 / np.array(kd_weights)
kd_weights = np.exp(kd_weights - np.max(kd_weights))
kd_weights = kd_weights / kd_weights.sum(axis=0)
for i in range(len(next_sent_feats)):
j = np.ceil(i * (len(next_sent_feats) / len(logits)))
j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
t_logit = t_logits[j]
t_loss = t_losses[j]
s_sent_feat = next_sent_feats[i]
fc = self.cls_fc[i]
s_sent_feat = fluid.layers.dropout(
......@@ -115,22 +133,25 @@ class AdaBERTClassifier(Layer):
t_probs.stop_gradient = False
kd_loss = t_probs * fluid.layers.log(s_probs / T)
kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
kd_loss = fluid.layers.reduce_mean(kd_loss, dim=0)
kd_loss = kd_loss / t_loss
kd_loss = kd_loss * kd_weights[i]
kd_losses.append(kd_loss)
kd_loss = fluid.layers.sum(kd_losses)
kd_loss = fluid.layers.reduce_mean(kd_loss, dim=0)
# define ce loss
ce_loss = fluid.layers.cross_entropy(s_probs, labels)
ce_loss = fluid.layers.mean(x=ce_loss) * k_i
ce_loss = fluid.layers.reduce_mean(ce_loss) * k_i
# define e loss
model_size = fluid.layers.sum(model_size)
flops = fluid.layers.sum(flops)
model_size = fluid.layers.sum(
model_size) / self.student.max_model_size()
flops = fluid.layers.sum(flops) / self.student.max_flops()
e_loss = (len(next_sent_feats) * k_i / self._n_layer) * (
flops + model_size)
# define total loss
loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss
print("ce_loss: {}; kd_loss: {}; e_loss: {}".format((
1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(), beta *
e_loss.numpy()))
return loss
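The reworked loss distills each intermediate student classifier from a matched teacher layer: the per-layer KD terms are divided by the corresponding teacher loss and reweighted by a softmax over the reciprocal teacher losses, the FLOPs and model-size terms are normalized by the student's maxima, and the three parts are combined with gamma=0.8 and beta=4 (the KD term enters with a minus sign because it is accumulated as t_probs * log(s_probs / T)). A numpy sketch of the weighting and the final combination, with placeholder values only:

```python
# Numpy sketch of the layer-wise KD weighting and the loss combination above.
# All numeric values are placeholders, not taken from the repo.
import numpy as np

t_losses = np.array([0.9, 0.7, 0.5, 0.4])   # hypothetical per-layer teacher losses
n_student = 3                                # hypothetical number of student exits

kd_weights = []
for i in range(n_student):
    # map student exit i to a teacher layer j, as in the diff
    j = int(np.ceil(i * (float(len(t_losses)) / n_student)))
    kd_weights.append(t_losses[j])

kd_weights = 1.0 / np.array(kd_weights)              # favor well-fitting teacher layers
kd_weights = np.exp(kd_weights - np.max(kd_weights))
kd_weights = kd_weights / kd_weights.sum(axis=0)     # softmax; weights sum to 1

# Final combination used in the diff (scalars stand in for the real tensors).
ce_loss, kd_loss, e_loss = 0.6, -0.3, 0.2            # placeholders
gamma, beta = 0.8, 4.0
loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss
print(kd_weights, loss)
```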
......@@ -82,6 +82,12 @@ class BertModelLayer(Layer):
self._encoder = EncoderLayer(
n_layer=self._n_layer, d_model=self._emb_size)
def max_flops(self):
return self._encoder.max_flops
def max_model_size(self):
return self._encoder.max_model_size
def arch_parameters(self):
return [self._encoder.alphas]
......
......@@ -64,8 +64,8 @@ OPS = {
'dil_conv_3': lambda : ConvBN(1, 1, filter_size=3, dilation=2),
'dil_conv_5': lambda : ConvBN(1, 1, filter_size=5, dilation=2),
'dil_conv_7': lambda : ConvBN(1, 1, filter_size=7, dilation=2),
'avg_pool_3': lambda : Pool2D(pool_size=(3, 1), pool_type='avg'),
'max_pool_3': lambda : Pool2D(pool_size=(3, 1), pool_type='max'),
'avg_pool_3': lambda : Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='avg'),
'max_pool_3': lambda : Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='max'),
'none': lambda : Zero(),
'skip_connect': lambda : Identity(),
}
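The 3x1 pooling ops now carry pool_padding=(1, 0): without padding, a length-3 window along the token axis shortens the sequence by 2, so the pooled branch could no longer be combined with the other candidate ops. A quick length check with the generic pooling formula (stride 1, hypothetical sequence length):

```python
# Output-length check for the 3x1 pooling ops (generic formula, stride 1):
# out = (L + 2*pad - k) // stride + 1
L, k, stride = 128, 3, 1                 # hypothetical sequence length and window
print((L + 2 * 0 - k) // stride + 1)     # 126: no padding shrinks the sequence
print((L + 2 * 1 - k) // stride + 1)     # 128: pool_padding=(1, 0) preserves it
```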
......@@ -76,10 +76,13 @@ class MixedOp(fluid.dygraph.Layer):
super(MixedOp, self).__init__()
ops = [OPS[primitive]() for primitive in PRIMITIVES]
self._ops = fluid.dygraph.LayerList(ops)
self.max_flops = max([FLOPs[primitive] for primitive in PRIMITIVES])
self.max_model_size = max(
[ModelSize[primitive] for primitive in PRIMITIVES])
def forward(self, x, weights, flops=[], model_size=[]):
for i in range(len(self._ops)):
if weights[i] != 0:
if weights[i].numpy() != 0:
flops.append(FLOPs.values()[i] * weights[i])
model_size.append(ModelSize.values()[i] * weights[i])
return self._ops[i](x) * weights[i]
......@@ -135,8 +138,8 @@ class ConvBN(fluid.dygraph.Layer):
self.conv_layer = Conv2D(
in_ch,
out_ch, [filter_size, 1],
dilation=dilation,
padding=[(filter_size - 1) // 2, 0],
dilation=[dilation, 1],
padding=[(filter_size - 1) * dilation // 2, 0],
param_attr=conv_param,
bias_attr=False,
act=None,
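ConvBN now scales both the dilation and the padding along the token axis: with padding (filter_size - 1) * dilation // 2, a dilated 1-D convolution keeps the sequence length unchanged. A quick check with the generic convolution length formula (stride 1, hypothetical values):

```python
# Length check for the dilated 1-D convolution (generic formula, stride 1):
# out = L + 2*pad - (dilation*(k - 1) + 1) + 1
L, k, d = 128, 5, 2                          # hypothetical length, filter size, dilation
pad = (k - 1) * d // 2                       # padding used in the diff: 4
print(L + 2 * pad - (d * (k - 1) + 1) + 1)   # 128: sequence length preserved
```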
......@@ -154,10 +157,14 @@ class Cell(fluid.dygraph.Layer):
super(Cell, self).__init__()
self._steps = steps
self.max_flops = 0
self.max_model_size = 0
ops = []
for i in range(self._steps):
for j in range(2 + i):
op = MixedOp()
self.max_flops += op.max_flops
self.max_model_size += op.max_model_size
ops.append(op)
self._ops = fluid.dygraph.LayerList(ops)
......@@ -191,10 +198,16 @@ class EncoderLayer(Layer):
self._n_layer = n_layer
self._d_model = d_model
self._steps = 3
self.max_flops = 0
self.max_model_size = 0
cells = []
for i in range(n_layer):
cells.append(Cell(steps=self._steps))
cell = Cell(steps=self._steps)
cells.append(cell)
self.max_flops += cell.max_flops
self.max_model_size += cell.max_model_size
self._cells = fluid.dygraph.LayerList(cells)
k = sum(1 for i in range(self._steps) for n in range(2 + i))
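max_flops and max_model_size are accumulated bottom-up: each MixedOp takes the maximum over its candidate ops, each Cell sums its MixedOps, and the EncoderLayer sums its cells, giving the upper bounds that normalize the efficiency loss into a 0-1 range. A plain-Python sketch of that accounting with placeholder per-op numbers:

```python
# Plain-Python sketch of the bottom-up FLOPs accounting (numbers are placeholders).
FLOPS = {'conv3': 3.0, 'conv5': 5.0, 'pool3': 1.0, 'skip': 0.0}

class MixedOp(object):
    def __init__(self):
        self.max_flops = max(FLOPS.values())          # worst case over candidates

class Cell(object):
    def __init__(self, steps=3):
        self.max_flops = sum(MixedOp().max_flops
                             for i in range(steps) for _ in range(2 + i))

class Encoder(object):
    def __init__(self, n_layer=8):
        self.max_flops = sum(Cell().max_flops for _ in range(n_layer))

print(Encoder().max_flops)   # upper bound that normalizes the measured FLOPs
```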
......@@ -222,7 +235,7 @@ class EncoderLayer(Layer):
[-1, 1, enc_input.shape[1], self._d_model])
alphas = gumbel_softmax(self.alphas)
k = gumbel_softmax(self.k)
k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
outputs = []
s0 = s1 = tmp
......@@ -235,7 +248,7 @@ class EncoderLayer(Layer):
enc_output = fluid.layers.reshape(
s1, [-1, enc_input.shape[1], self._d_model])
outputs.append(enc_output)
if k[i] != 0:
if k[i].numpy() != 0:
outputs[-1] = outputs[-1] * k[i]
break
return outputs, k[i]
return outputs, k[i]
return None
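The forward pass flattens gumbel_softmax(self.k) into a 1-D vector and exits at the first cell whose entry is nonzero, so the sampled k decides how deep the student runs and which intermediate output is returned. Below is a numpy sketch of a hard (one-hot) Gumbel-softmax draw plus that early-exit scan; it assumes a hard sample, which is what the nonzero check above implies, and it is not the repo's gumbel_softmax implementation.

```python
# Hard Gumbel-softmax draw plus the early-exit scan (numpy sketch, not the repo's code).
import numpy as np

def hard_gumbel_softmax(logits, tau=1.0):
    g = -np.log(-np.log(np.random.uniform(1e-20, 1.0, size=logits.shape)))  # Gumbel noise
    y = np.exp((logits + g) / tau)
    y = y / y.sum()
    hard = np.zeros_like(y)
    hard[np.argmax(y)] = 1.0          # one-hot sample
    return hard

k = hard_gumbel_softmax(np.zeros(8))  # hypothetical depth logits, one per cell
for i in range(len(k)):
    if k[i] != 0:                     # mirrors the `k[i].numpy() != 0` check above
        print("exit after cell", i)
        break
```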
......@@ -19,6 +19,7 @@ from __future__ import print_function
__all__ = ['DARTSearch']
import logging
from itertools import izip
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
......@@ -75,10 +76,8 @@ class DARTSearch(object):
objs = AvgrageMeter()
self.model.train()
for step_id, (
train_data,
valid_data) in enumerate(zip(train_loader(), valid_loader())):
step_id = 0
for train_data, valid_data in izip(train_loader(), valid_loader()):
if epoch >= self.epochs_no_archopt:
architect.step(train_data, valid_data)
......@@ -95,11 +94,13 @@ class DARTSearch(object):
optimizer.minimize(loss, grad_clip)
self.model.clear_gradients()
objs.update(loss.numpy(), self.batchsize)
batch_size = train_data[0].shape[0]
objs.update(loss.numpy(), batch_size)
if step_id % self.log_freq == 0:
logger.info("Train Epoch {}, Step {}, loss {:.6f}".format(
epoch, step_id, objs.avg[0]))
step_id += 1
return objs.avg[0]
def valid_one_epoch(self, valid_loader, epoch):
......
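Note that itertools.izip exists only on Python 2; on Python 3 the built-in zip is already lazy, so a small shim would keep this loop working on both interpreters if that ever becomes a concern (the shim below is not part of the diff):

```python
# Optional Python 2/3 shim (not in the diff): izip is Python 2 only.
try:
    from itertools import izip
except ImportError:
    izip = zip   # Python 3's built-in zip is already lazy

for train_data, valid_data in izip(iter([1, 2]), iter([3, 4])):
    print(train_data, valid_data)
```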
......@@ -86,17 +86,20 @@ class BERTClassifier(Layer):
self.cls_model = ClsModelLayer(
self.bert_config, num_labels, return_pooled_out=True)
if self.init_pretraining_params:
print("Load pre-trained model from %s" %
self.init_pretraining_params)
init_from_static_model(self.init_pretraining_params,
self.cls_model, self.bert_config)
if model_path is not None:
#restore the model
print("Load params from %s" % model_path)
model_dict, _ = fluid.load_dygraph(model_path)
self.cls_model.load_dict(model_dict)
elif self.init_pretraining_params:
print("Load pre-trained model from %s" %
self.init_pretraining_params)
init_from_static_model(self.init_pretraining_params,
self.cls_model, self.bert_config)
else:
raise Exception(
"You should load pretrained model for training this teacher model."
)
def forward(self, input):
return self.cls_model(input)
......
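The teacher classifier now prefers an explicit dygraph checkpoint (model_path) over static pretraining params and refuses to construct without either. A minimal restore sketch using the Paddle 1.x dygraph API follows; the checkpoint prefix is a placeholder.

```python
# Minimal restore sketch (Paddle 1.x dygraph API; the checkpoint prefix is a placeholder).
import paddle.fluid as fluid

with fluid.dygraph.guard():
    param_dict, _ = fluid.load_dygraph("./checkpoints/steps_23000")
    # cls_model.load_dict(param_dict)   # apply to an already-constructed ClsModelLayer
```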