Commit 9240f31c authored by baiyfbupt

update basekd

Parent 4233d6a8
@@ -10,26 +10,34 @@ import pickle
import logging
from paddleslim.common import AvgrageMeter, get_logger
from paddleslim.nas.darts import count_parameters_in_MB
logger = get_logger(__name__, level=logging.INFO)
def valid_one_epoch(model, valid_loader, epoch, log_freq):
accs = AvgrageMeter()
ce_losses = AvgrageMeter()
model.student.eval()
t_accs = AvgrageMeter()
model.eval()
step_id = 0
for valid_data in valid_loader():
try:
loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
loss, acc, ce_loss, _, _, t_acc = model._layers.loss(valid_data,
epoch)
except:
loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
loss, acc, ce_loss, _, _, t_acc = model.loss(valid_data, epoch)
batch_size = valid_data[0].shape[0]
ce_losses.update(ce_loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
t_accs.update(t_acc.numpy(), batch_size)
step_id += 1
return ce_losses.avg[0], accs.avg[0]
return ce_losses.avg[0], accs.avg[0], t_accs.avg[0]
def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
@@ -38,18 +46,19 @@ def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
accs = AvgrageMeter()
ce_losses = AvgrageMeter()
kd_losses = AvgrageMeter()
model.student.train()
t_accs = AvgrageMeter()
model.train()
step_id = 0
for train_data in train_loader():
batch_size = train_data[0].shape[0]
if use_data_parallel:
total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
total_loss, acc, ce_loss, kd_loss, _, t_acc = model._layers.loss(
train_data, epoch)
else:
total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
epoch)
total_loss, acc, ce_loss, kd_loss, _, t_acc = model.loss(
train_data, epoch)
if use_data_parallel:
total_loss = model.scale_loss(total_loss)
@@ -63,19 +72,23 @@ def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
accs.update(acc.numpy(), batch_size)
ce_losses.update(ce_loss.numpy(), batch_size)
kd_losses.update(kd_loss.numpy(), batch_size)
t_accs.update(t_acc.numpy(), batch_size)
if step_id % log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f};".
"Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, teacher_acc {:.6f};".
format(epoch, step_id,
optimizer.current_step_lr(), total_losses.avg[0],
ce_losses.avg[0], kd_losses.avg[0], accs.avg[0]))
optimizer.current_step_lr(), total_losses.avg[
0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0],
t_accs.avg[0]))
step_id += 1
return total_losses.avg[0], accs.avg[0]
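# A minimal stand-in for the batch-weighted running averages used by the two
# loops above (paddleslim.common.AvgrageMeter is assumed to behave roughly like
# this; an illustration only, not the library's source). It explains why the
# loops index .avg[0]: update() is fed a (1,)-shaped numpy value per batch.
import numpy as np

class _Meter(object):
    def __init__(self):
        self.sum, self.cnt = 0.0, 0

    def update(self, val, n):
        # weight each batch by the number of samples it contains
        self.sum = self.sum + np.asarray(val) * n
        self.cnt += n

    @property
    def avg(self):
        # running, sample-weighted mean over the epoch so far
        return self.sum / self.cnt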
def main():
# whether use multi-gpus
use_data_parallel = False
device_num = fluid.dygraph.parallel.Env().nranks
use_data_parallel = device_num > 1
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
@@ -88,12 +101,12 @@ def main():
max_seq_len = 128
batch_size = 192
hidden_size = 768
hidden_size = 128
emb_size = 768
epoch = 80
log_freq = 10
log_freq = 1
task_name = 'mnli'
task_name = 'mrpc'
if task_name == 'mrpc':
data_dir = "./data/glue_data/MRPC/"
@@ -110,7 +123,6 @@ def main():
num_labels = 3
processor_func = MnliProcessor
device_num = fluid.dygraph.parallel.Env().nranks
use_fixed_gumbel = True
train_phase = "train"
val_phase = "dev"
@@ -129,7 +141,11 @@ def main():
emb_size=emb_size,
teacher_model=teacher_model_dir,
data_dir=data_dir,
use_fixed_gumbel=use_fixed_gumbel)
use_fixed_gumbel=use_fixed_gumbel,
t=1.0)
logger.info("param size = {:.6f}MB".format(
count_parameters_in_MB(model.student.parameters())))
learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
@@ -174,7 +190,8 @@ def main():
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
return_list=True,
use_multiprocess=True)
dev_loader = fluid.io.DataLoader.from_generator(
capacity=128,
use_double_buffer=True,
@@ -190,14 +207,18 @@ def main():
best_valid_acc = 0
for epoch_id in range(epoch):
train_one_epoch(model, train_loader, optimizer, epoch_id,
use_data_parallel, log_freq)
loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
total_loss, train_acc = train_one_epoch(
model, train_loader, optimizer, epoch_id, use_data_parallel,
log_freq)
logger.info("train set, total_loss {:.6f}; acc {:.6f};".format(
total_loss, train_acc))
loss, acc, t_acc = valid_one_epoch(model, dev_loader, epoch_id,
log_freq)
if acc > best_valid_acc:
best_valid_acc = acc
logger.info(
"dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format(
loss, acc, best_valid_acc))
"dev set, ce_loss {:.6f}; teacher_acc: {:.6f}, acc {:.6f}, best_acc {:.6f};".
format(loss, t_acc, acc, best_valid_acc))
if __name__ == '__main__':
@@ -57,7 +57,7 @@ class AdaBERTClassifier(Layer):
use_fixed_gumbel=False,
gumbel_alphas=None,
fix_emb=False,
t=5.0):
t=1.0):
super(AdaBERTClassifier, self).__init__()
self._n_layer = n_layer
self._num_labels = num_labels
@@ -78,8 +78,9 @@ class AdaBERTClassifier(Layer):
self.teacher = BERTClassifier(
num_labels, task_name=task_name, model_path=self._teacher_model)
# global setting; will be overwritten when training (about 1% acc loss)
self.teacher.eval()
self.teacher.test(self._data_dir)
self.teacher.eval()
print(
"----------------------finish load teacher model and test----------------------------------------"
)
@@ -116,49 +117,67 @@ class AdaBERTClassifier(Layer):
def loss(self, data_ids, epoch):
labels = data_ids[4]
s_logits = self.student(data_ids, epoch)
s_logits, s_fea = self.student(data_ids, epoch)
# make sure the teacher is computed in eval mode
self.teacher.eval()
t_total_loss, t_logits, t_losses, t_accs, _, t_fea = self.teacher(
data_ids)
if self.student.training:
self.student.train()
t_logits[-1].stop_gradient = True
#kd_loss = fluid.layers.mse_loss(s_logits[-1], t_logits[-1])
#kd_loss = fluid.layers.mse_loss(s_fea, t_fea)
t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)
#kd_loss = fluid.layers.reduce_sum(fluid.layers.square(s_logits[-1] - t_logits[-1]))
t_probs = fluid.layers.softmax(t_logits[-1] / self.T)
s_probs = fluid.layers.softmax(s_logits[-1] / self.T)
kd_loss = fluid.layers.reduce_mean(
fluid.layers.cross_entropy(
input=s_probs, label=t_probs, soft_label=True))
#define kd loss
kd_weights = []
for i in range(len(s_logits)):
j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
kd_weights.append(t_losses[j].numpy())
kd_weights = np.array(kd_weights)
kd_weights = np.squeeze(kd_weights)
kd_weights = to_variable(kd_weights)
kd_weights = fluid.layers.softmax(-kd_weights)
kd_losses = []
for i in range(len(s_logits)):
j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
t_logit = t_logits[j]
s_logit = s_logits[i]
t_logit.stop_gradient = True
t_probs = fluid.layers.softmax(t_logit) # P_j^T
s_probs = fluid.layers.softmax(s_logit / self.T) #P_j^S
#kd_loss = -t_probs * fluid.layers.log(s_probs)
kd_loss = fluid.layers.cross_entropy(
input=s_probs, label=t_probs, soft_label=True)
kd_loss = fluid.layers.reduce_mean(kd_loss)
kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
kd_losses.append(kd_loss)
kd_loss = fluid.layers.sum(kd_losses)
# kd_weights = []
# for i in range(len(s_logits)):
# j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
# kd_weights.append(t_losses[j].numpy())
# kd_weights = np.array(kd_weights)
# kd_weights = np.squeeze(kd_weights)
# kd_weights = to_variable(kd_weights)
# kd_weights = fluid.layers.softmax(-kd_weights)
# kd_losses = []
# for i in range(len(s_logits)):
# j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
# t_logit = t_logits[j]
# s_logit = s_logits[i]
# t_logit.stop_gradient = True
# t_probs = fluid.layers.softmax(t_logit) # P_j^T
# s_probs = fluid.layers.softmax(s_logit / self.T) #P_j^S
# #kd_loss = -t_probs * fluid.layers.log(s_probs)
# kd_loss = fluid.layers.cross_entropy(
# input=s_probs, label=t_probs, soft_label=True)
# kd_loss = fluid.layers.reduce_mean(kd_loss)
# kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
# kd_losses.append(kd_loss)
# kd_loss = fluid.layers.sum(kd_losses)
losses = []
for logit in s_logits:
for logit in [s_logits[-1]]:
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logit, label=labels, return_softmax=True)
#print("training: ", self.student.training, probs.numpy())
loss = fluid.layers.mean(x=ce_loss)
losses.append(loss)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
accuracy = fluid.layers.accuracy(input=probs, label=labels)
ce_loss = fluid.layers.sum(losses)
total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
#total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
total_loss = kd_loss
return total_loss, accuracy, ce_loss, kd_loss, s_logits
return total_loss, accuracy, ce_loss, kd_loss, s_logits, t_accs[-1]
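# A standalone sketch of the distillation objective this hunk converges on:
# temperature-scaled soft-label cross-entropy between the final student and
# teacher logits. The function name and argument shapes are illustrative, not
# from the repo; self.T above corresponds to the temperature T here, and the
# fluid module is the one already imported by this file.
def soft_label_kd_loss(s_logit, t_logit, T=1.0):
    # the teacher only provides soft targets, so no gradient flows into it
    t_logit.stop_gradient = True
    t_probs = fluid.layers.softmax(t_logit / T)
    s_probs = fluid.layers.softmax(s_logit / T)
    return fluid.layers.reduce_mean(
        fluid.layers.cross_entropy(
            input=s_probs, label=t_probs, soft_label=True))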
@@ -91,6 +91,11 @@ class BertModelLayer(Layer):
output_dim=self._hidden_size,
param_attr=fluid.ParamAttr(name="s_emb_factorization"))
self._emb_fac_1 = Linear(
input_dim=self._emb_size,
output_dim=self._hidden_size,
param_attr=fluid.ParamAttr(name="s_emb_factorization_1"))
self._encoder = EncoderLayer(
num_labels=num_labels,
n_layer=self._n_layer,
@@ -103,10 +108,6 @@
return self._src_emb.parameters() + self._pos_emb.parameters(
) + self._sent_emb.parameters()
def emb_names(self):
return self._src_emb.parameters() + self._pos_emb.parameters(
) + self._sent_emb.parameters()
def max_flops(self):
return self._encoder.max_flops
@@ -129,6 +130,6 @@ class BertModelLayer(Layer):
emb_out_1 = self._emb_fac(src_emb_1)
# (bs, seq_len, hidden_size)
enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
enc_outputs, fea = self._encoder(emb_out_0, emb_out_1, epoch)
return enc_outputs
return enc_outputs, fea
@@ -45,8 +45,8 @@ OPS = {
'avg_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='avg'),
'max_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='max'),
'none': lambda n_channel, name: Zero(),
'skip_connect': lambda n_channel, name: Identity(),
'none': lambda n_channel, name: Zero(),
}
@@ -61,10 +61,10 @@ class MixedOp(fluid.dygraph.Layer):
if 'pool' in primitive:
gama = ParamAttr(
initializer=fluid.initializer.Constant(value=1),
trainable=False)
trainable=True)
beta = ParamAttr(
initializer=fluid.initializer.Constant(value=0),
trainable=False)
trainable=True)
BN = BatchNorm(n_channel, param_attr=gama, bias_attr=beta)
op = fluid.dygraph.Sequential(op, BN)
ops.append(op)
@@ -125,7 +125,7 @@ class ReluConvBN(fluid.dygraph.Layer):
filter_size=[3, 1],
dilation=1,
stride=1,
affine=False,
affine=True,
use_cudnn=True,
name=None):
super(ReluConvBN, self).__init__()
@@ -210,40 +210,40 @@ class EncoderLayer(Layer):
super(EncoderLayer, self).__init__()
self._n_layer = n_layer
self._hidden_size = hidden_size
self._n_channel = 128
self._n_channel = hidden_size
self._steps = 3
self._n_ops = len(ConvBN_PRIMITIVES)
self.use_fixed_gumbel = use_fixed_gumbel
self.stem0 = fluid.dygraph.Sequential(
Conv2D(
num_channels=1,
num_filters=self._n_channel,
filter_size=[3, self._hidden_size],
padding=[1, 0],
param_attr=fluid.ParamAttr(initializer=MSRA()),
bias_attr=False),
BatchNorm(
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0))))
self.stem1 = fluid.dygraph.Sequential(
Conv2D(
num_channels=1,
num_filters=self._n_channel,
filter_size=[3, self._hidden_size],
padding=[1, 0],
param_attr=fluid.ParamAttr(initializer=MSRA()),
bias_attr=False),
BatchNorm(
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0))))
# self.stem0 = fluid.dygraph.Sequential(
# Conv2D(
# num_channels=1,
# num_filters=self._n_channel,
# filter_size=[3, self._hidden_size],
# padding=[1, 0],
# param_attr=fluid.ParamAttr(initializer=MSRA()),
# bias_attr=False),
# BatchNorm(
# num_channels=self._n_channel,
# param_attr=fluid.ParamAttr(
# initializer=fluid.initializer.Constant(value=1)),
# bias_attr=fluid.ParamAttr(
# initializer=fluid.initializer.Constant(value=0))))
# self.stem1 = fluid.dygraph.Sequential(
# Conv2D(
# num_channels=1,
# num_filters=self._n_channel,
# filter_size=[3, self._hidden_size],
# padding=[1, 0],
# param_attr=fluid.ParamAttr(initializer=MSRA()),
# bias_attr=False),
# BatchNorm(
# num_channels=self._n_channel,
# param_attr=fluid.ParamAttr(
# initializer=fluid.initializer.Constant(value=1)),
# bias_attr=fluid.ParamAttr(
# initializer=fluid.initializer.Constant(value=0))))
cells = []
for i in range(n_layer):
@@ -271,10 +271,10 @@ class EncoderLayer(Layer):
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1),
trainable=False),
trainable=True),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0),
trainable=False))
trainable=True))
out = Linear(
self._n_channel,
num_labels,
@@ -311,17 +311,28 @@ class EncoderLayer(Layer):
s0 = fluid.layers.unsqueeze(enc_input_0, [1])
s1 = fluid.layers.unsqueeze(enc_input_1, [1])
s0 = self.stem0(s0)
s1 = self.stem1(s1)
s0 = fluid.layers.transpose(s0, [0, 3, 2, 1])
s1 = fluid.layers.transpose(s1, [0, 3, 2, 1])
# s0 = self.stem0(s0)
# s1 = self.stem1(s1)
enc_outputs = []
fea = []
for i in range(self._n_layer):
s0, s1 = s1, self._cells[i](s0, s1, alphas)
# (bs, n_channel, seq_len, 1)
tmp = self._bns[i](s1)
tmp = s1
tmp = self.pool2d_avg(tmp)
tmp = fluid.layers.reshape(tmp, shape=[-1, 0])
fea.append(tmp)
tmp = fluid.layers.dropout(
x=tmp,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
tmp = self._outs[i](tmp)
enc_outputs.append(tmp)
return enc_outputs
return enc_outputs, fea[-1]
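# Shape walk-through for the new forward path above (a sketch; it assumes
# enc_input_0 / enc_input_1 of shape (bs, seq_len, hidden_size) and that
# pool2d_avg is a global average pool, neither of which is shown in this hunk):
#   unsqueeze([1])            -> (bs, 1, seq_len, hidden_size)
#   transpose([0, 3, 2, 1])   -> (bs, hidden_size, seq_len, 1)   # replaces the stem convs
#   cell i                    -> (bs, n_channel, seq_len, 1)     # n_channel == hidden_size now
#   pool2d_avg                -> (bs, n_channel, 1, 1)
#   reshape(shape=[-1, 0])    -> (bs, n_channel)                 # kept in fea for distillation
#   dropout + _outs[i]        -> (bs, num_labels)                # one logit head per cell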
@@ -120,12 +120,16 @@ class BERTClassifier(Layer):
test_data_generator = processor.data_generator(
batch_size=batch_size, phase='dev', epoch=1, shuffle=False)
# test train mode test_acc
self.cls_model.eval()
print("test with test mode:...")
total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
for batch in test_data_generator():
data_ids = create_data(batch)
total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
total_loss, _, _, np_acces, np_num_seqs, fea = self.cls_model(
data_ids)
np_loss = total_loss.numpy()
np_acc = np_acces[-1].numpy()
@@ -118,4 +118,5 @@ class ClsModelLayer(Layer):
accuracys.append(accuracy)
total_loss = fluid.layers.sum(losses)
return total_loss, logits, losses, accuracys, num_seqs
return total_loss, logits, losses, accuracys, num_seqs, next_sent_feat[
-1]
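# The teacher classifier now also returns its pooled feature (next_sent_feat[-1]),
# mirroring the s_fea returned by the student; the commented-out line in
# AdaBERTClassifier.loss (#kd_loss = fluid.layers.mse_loss(s_fea, t_fea)) hints at a
# feature-matching variant. A hedged sketch of that alternative, assuming both
# features share the same (bs, hidden) shape:
def feature_kd_loss(s_fea, t_fea):
    # do not backpropagate through the teacher feature
    t_fea.stop_gradient = True
    return fluid.layers.mse_loss(s_fea, t_fea)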