Commit 70c6b708 authored by W wanghaoshuang

Update BERT-based DARTS demo

Parent d3e51bfc
......@@ -16,8 +16,9 @@ __all__ = ['AvgrageMeter']
class AvgrageMeter(object):
def __init__(self):
def __init__(self, format="{}"):
self.reset()
self._format = format
def reset(self):
self.avg = 0
......@@ -28,3 +29,6 @@ class AvgrageMeter(object):
self.sum += val * n
self.cnt += n
self.avg = self.sum / self.cnt
def __repr__(self):
return self._format.format(self.avg)
......@@ -56,7 +56,10 @@ class Architect(object):
else:
loss = self._backward_step(valid_data)
self.optimizer.minimize(loss)
# print("alphas gradient: {}".format(self.model.arch_parameters()[0].gradient()))
self.optimizer.clear_gradients()
return self.model.arch_parameters()[0].gradient()
def _backward_step(self, valid_data):
loss = self.model.loss(valid_data)
......
......@@ -50,7 +50,9 @@ class AdaBERTClassifier(Layer):
beta=4,
conv_type="conv_bn",
search_layer=True,
teacher_model=None):
teacher_model=None,
alphas=None,
k=None):
super(AdaBERTClassifier, self).__init__()
self._n_layer = n_layer
self._num_labels = num_labels
......@@ -60,6 +62,8 @@ class AdaBERTClassifier(Layer):
self._beta = beta
self._conv_type = conv_type
self._search_layer = search_layer
self._alphas = alphas
self._k = k
print(
"----------------------load teacher model and test----------------------------------------"
)
......@@ -86,20 +90,36 @@ class AdaBERTClassifier(Layer):
bias_attr=fluid.ParamAttr(
name="s_cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
fc = self.add_sublayer("s_cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
def forward(self, data_ids):
def forward(self, data_ids, alphas=None, k=None):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
return self.student(src_ids, position_ids, sentence_ids)
return self.student(
src_ids,
position_ids,
sentence_ids,
alphas=self._alphas,
k=self._k)
def arch_parameters(self):
return self.student.arch_parameters()
def model_parameters(self):
model_parameters = [
p for p in self.student.parameters()
if p.name not in [a.name for a in self.arch_parameters()]
]
return model_parameters
def genotype(self):
return self.arch_parameters()
alphas = self.arch_parameters()[0].numpy()
alphas = [np.argmax(edge) for edge in alphas]
k = np.argmax(self.arch_parameters()[1].numpy())
return "layers: {}; edges: {} ".format(k, alphas)
def new(self):
model_new = AdaBERTClassifier(
......@@ -108,8 +128,7 @@ class AdaBERTClassifier(Layer):
)
return model_new
def loss(self, data_ids):
T = 1.0
def valid(self, data_ids):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
......@@ -117,16 +136,57 @@ class AdaBERTClassifier(Layer):
labels = data_ids[4]
flops = []
model_size = []
alphas = self.arch_parameters()[0].numpy(
) if self._alphas is None else self._alphas
k = self.arch_parameters()[1].numpy() if self._k is None else self._k
print(alphas.shape)
print(k.shape)
enc_outputs, next_sent_feats, k_i = self.student(
src_ids,
position_ids,
sentence_ids,
flops=flops,
model_size=model_size)
model_size=model_size,
alphas=alphas,
k=k)
logits = self.cls_fc[-1](next_sent_feats[-1])
probs = fluid.layers.softmax(logits)
accuracy = fluid.layers.accuracy(input=probs, label=labels)
model_size = np.sum(model_size)
flops = np.sum(flops)
ret = {
"accuracy": accuracy.numpy(),
"model_size(MB)": model_size / 1e6,
"FLOPs(M)": flops / 1e6
}
return ret
def loss(self, data_ids):
T = 1.0
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
labels = data_ids[4]
flops = []
model_size = []
self.teacher.eval()
total_loss, t_logits, t_losses, accuracys, num_seqs = self.teacher(
data_ids)
self.teacher.train()
enc_outputs, next_sent_feats, k_i = self.student(
src_ids,
position_ids,
sentence_ids,
flops=flops,
model_size=model_size,
alphas=self._alphas,
k=self._k)
# define kd loss
kd_losses = []
......@@ -140,21 +200,16 @@ class AdaBERTClassifier(Layer):
kd_weights = np.exp(kd_weights - np.max(kd_weights))
kd_weights = kd_weights / kd_weights.sum(axis=0)
s_probs = None
for i in range(len(next_sent_feats)):
j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
t_logit = t_logits[j]
s_sent_feat = next_sent_feats[i]
fc = self.cls_fc[i]
s_sent_feat = fluid.layers.dropout(
x=s_sent_feat,
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
s_logits = fc(s_sent_feat)
t_logit.stop_gradient = True
t_probs = fluid.layers.softmax(t_logit)
s_probs = fluid.layers.softmax(s_logits)
t_probs.stop_gradient = True
kd_loss = t_probs * fluid.layers.log(s_probs / T)
kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
kd_loss = kd_loss * kd_weights[i]
......@@ -167,17 +222,28 @@ class AdaBERTClassifier(Layer):
ce_loss = fluid.layers.cross_entropy(s_probs, labels)
ce_loss = fluid.layers.reduce_mean(ce_loss) * k_i
len_model_size = len(model_size)
# define e loss
model_size = fluid.layers.sum(model_size)
# print("model_size: {}".format(model_size.numpy()/1e6))
if self._alphas is not None:
flops = np.sum(flops)
model_size = np.sum(model_size)
else:
flops = fluid.layers.sum(flops)
model_size = fluid.layers.sum(model_size)
model_size = model_size / self.student.max_model_size()
flops = fluid.layers.sum(flops) / self.student.max_flops()
e_loss = (len(next_sent_feats) * k_i / self._n_layer) * (
flops + model_size)
flops = flops / self.student.max_flops()
e_loss = (flops + model_size) * (len(next_sent_feats) * k_i /
self._n_layer)
print(
"len(next_sent_feats): {}; k_i: {}; flops: {}; model_size: {}; len: {}".
format(
len(next_sent_feats), k_i,
flops.numpy(), model_size.numpy(), len_model_size))
# define total loss
loss = (1 - self._gamma
) * ce_loss - self._gamma * kd_loss + self._beta * e_loss
# print("ce_loss: {}; kd_loss: {}; e_loss: {}".format((
# 1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(), beta *
# e_loss.numpy()))
return loss, ce_loss, kd_loss, e_loss
# loss = ce_loss + self._beta * e_loss
# return loss, ce_loss, ce_loss, e_loss
......@@ -110,10 +110,9 @@ class BertModelLayer(Layer):
position_ids,
sentence_ids,
flops=[],
model_size=[]):
"""
forward
"""
model_size=[],
alphas=None,
k=None):
src_emb = self._src_emb(src_ids)
pos_emb = self._pos_emb(position_ids)
sent_emb = self._sent_emb(sentence_ids)
......@@ -122,9 +121,8 @@ class BertModelLayer(Layer):
emb_out = emb_out + sent_emb
emb_out = self._emb_fac(emb_out)
enc_outputs, k_i = self._encoder(
emb_out, flops=flops, model_size=model_size)
emb_out, flops=flops, model_size=model_size, alphas=alphas, k=k)
if not self.return_pooled_out:
return enc_outputs
......
......@@ -57,12 +57,12 @@ class ClsModelLayer(Layer):
fc = Linear(
input_dim=self.config["hidden_size"],
output_dim=num_labels,
param_attr=fluid.ParamAttr(
name="cls_out_%d_w" % i,
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
......
......@@ -18,6 +18,7 @@ from __future__ import division
from __future__ import print_function
import numpy as np
from collections import Iterable
import paddle
import paddle.fluid as fluid
......@@ -25,13 +26,13 @@ from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, Ba
from paddle.fluid.initializer import NormalInitializer
GConv_PRIMITIVES = [
'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3', 'dil_gconv_5',
'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'none', 'skip_connect'
'none', 'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3',
'dil_gconv_5', 'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'skip_connect'
]
ConvBN_PRIMITIVES = [
'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none',
'none', 'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3',
'skip_connect'
]
......@@ -117,7 +118,11 @@ class MixedOp(fluid.dygraph.Layer):
def forward(self, x, weights, flops=[], model_size=[]):
for i in range(len(self._ops)):
if weights[i].numpy() != 0:
if isinstance(weights, Iterable):
weights_i = weights[i]
else:
weights_i = weights[i].numpy()
if weights_i != 0:
flops.append(FLOPs.values()[i] * weights[i])
model_size.append(ModelSize.values()[i] * weights[i])
return self._ops[i](x) * weights[i]
......@@ -166,6 +171,7 @@ class ConvBNRelu(fluid.dygraph.Layer):
use_cudnn=True,
name=None):
super(ConvBNRelu, self).__init__()
self._name = name
conv_std = (2.0 /
(filter_size[0] * filter_size[1] * out_c * in_c))**0.5
conv_param = fluid.ParamAttr(
......@@ -187,7 +193,7 @@ class ConvBNRelu(fluid.dygraph.Layer):
def forward(self, inputs):
conv = self.conv(inputs)
bn = self.bn(conv)
return bn
return conv
class GateConv(fluid.dygraph.Layer):
......@@ -261,24 +267,30 @@ class Cell(fluid.dygraph.Layer):
states = [s0, s1]
offset = 0
for i in range(self._steps):
s = fluid.layers.sums([
self._ops[offset + j](h,
weights[offset + j],
flops=flops,
model_size=model_size)
for j, h in enumerate(states)
])
edges = []
for j, h in enumerate(states):
edge = self._ops[offset + j](h,
weights[offset + j],
flops=flops,
model_size=model_size)
edges.append(edge)
s = edges[0]
for n in range(1, len(edges)):
s = s + edges[n]
# s = fluid.layers.sums(edges)
offset += len(states)
states.append(s)
out = fluid.layers.sum(states[-self._steps:])
states = states[-self._steps:]
out = states[0]
for n in range(1, len(states)):
out = out + states[n]
# out = fluid.layers.sums(states[-self._steps:])
return out
class EncoderLayer(Layer):
"""
encoder
"""
def __init__(self,
n_layer,
hidden_size=768,
......@@ -342,15 +354,17 @@ class EncoderLayer(Layer):
default_initializer=NormalInitializer(
loc=0.0, scale=1e-3))
def forward(self, enc_input, flops=[], model_size=[]):
def forward(self, enc_input, flops=[], model_size=[], alphas=None, k=None):
tmp = fluid.layers.reshape(
enc_input, [-1, 1, enc_input.shape[1],
self._hidden_size]) #(bs, 1, seq_len, hidden_size)
tmp = self.conv0(tmp) # (bs, hidden_size, seq_len, 1)
alphas = gumbel_softmax(self.alphas)
k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
if alphas is None:
alphas = gumbel_softmax(self.alphas)
if k is None:
k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
outputs = []
s0 = s1 = tmp
......@@ -364,7 +378,11 @@ class EncoderLayer(Layer):
enc_output, [-1, enc_output.shape[1],
self._hidden_size]) # (bs, seq_len, hidden_size)
outputs.append(enc_output)
if self._search_layer and k[i].numpy() != 0:
if isinstance(k, Iterable):
k_i = k[i]
else:
k_i = k[i].numpy()
if k_i != 0:
outputs[-1] = outputs[-1] * k[i]
return outputs, k[i]
return outputs, 1.0
......@@ -18,10 +18,12 @@ from __future__ import print_function
__all__ = ['DARTSearch']
import math
import logging
from itertools import izip
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.framework import Variable
from paddle.fluid.dygraph.base import to_variable
from ...common import AvgrageMeter, get_logger
from .architect import Architect
......@@ -41,6 +43,7 @@ class DARTSearch(object):
model,
train_reader,
valid_reader,
test_reader=None,
learning_rate=0.025,
batchsize=64,
num_imgs=50000,
......@@ -54,6 +57,7 @@ class DARTSearch(object):
self.model = model
self.train_reader = train_reader
self.valid_reader = valid_reader
self.test_reader = test_reader
self.learning_rate = learning_rate
self.batchsize = batchsize
self.num_imgs = num_imgs
......@@ -82,10 +86,13 @@ class DARTSearch(object):
step_id = 0
for train_data, valid_data in izip(train_loader(), valid_loader()):
if epoch >= self.epochs_no_archopt:
architect.step(train_data, valid_data)
alphas_grad = architect.step(train_data, valid_data)
loss, ce_loss, kd_loss, e_loss = self.model.loss(train_data)
if math.isnan(e_loss.numpy()):
print("alphas_grad: {}".format(alphas_grad))
print("alphas: {}".format(self.model.arch_parameters()[0]
.numpy()))
if self.use_data_parallel:
loss = self.model.scale_loss(loss)
loss.backward()
......@@ -93,67 +100,55 @@ class DARTSearch(object):
else:
loss.backward()
grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
optimizer.minimize(loss, grad_clip)
# grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
# optimizer.minimize(loss, grad_clip)
optimizer.minimize(loss)
self.model.clear_gradients()
batch_size = train_data[0].shape[0]
objs.update(loss.numpy(), batch_size)
e_loss = e_loss.numpy() if isinstance(e_loss, Variable) else e_loss
ce_losses.update(ce_loss.numpy(), batch_size)
kd_losses.update(kd_loss.numpy(), batch_size)
e_losses.update(e_loss.numpy(), batch_size)
e_losses.update(e_loss, batch_size)
if step_id % self.log_freq == 0:
#logger.info("Train Epoch {}, Step {}, loss {:.6f}; ce: {:.6f}; kd: {:.6f}; e: {:.6f}".format(
# epoch, step_id, objs.avg[0], ce_losses.avg[0], kd_losses.avg[0], e_losses.avg[0]))
logger.info(
"Train Epoch {}, Step {}, loss {}; ce: {}; kd: {}; e: {}".
format(epoch, step_id,
loss.numpy(),
ce_loss.numpy(), kd_loss.numpy(), e_loss.numpy()))
ce_loss.numpy(), kd_loss.numpy(), e_loss))
step_id += 1
return objs.avg[0]
def valid_one_epoch(self, valid_loader, epoch):
objs = AvgrageMeter()
top1 = AvgrageMeter()
top5 = AvgrageMeter()
self.model.eval()
meters = {}
for step_id, valid_data in enumerate(valid_loader):
image = to_variable(image)
label = to_variable(label)
n = image.shape[0]
logits = self.model(image)
prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
loss = fluid.layers.reduce_mean(
fluid.layers.softmax_with_cross_entropy(logits, label))
objs.update(loss.numpy(), n)
top1.update(prec1.numpy(), n)
top5.update(prec5.numpy(), n)
ret = self.model.valid(valid_data)
for key, value in ret.items():
if key not in meters:
meters[key] = AvgrageMeter()
meters[key].update(value, 1)
if step_id % self.log_freq == 0:
logger.info(
"Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[
0]))
return top1.avg[0]
logger.info("Valid Epoch {}, Step {}, {}".format(
epoch, step_id, meters))
def train(self):
if self.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model_parameters = [
p for p in self.model.parameters()
if p.name not in [a.name for a in self.model.arch_parameters()]
]
logger.info("param size = {:.6f}MB".format(
model_parameters = self.model.model_parameters()
logger.info("parameter size in super net: {:.6f}M".format(
count_parameters_in_MB(model_parameters)))
step_per_epoch = int(self.num_imgs * 0.5 / self.batchsize)
if self.unrolled:
step_per_epoch *= 2
learning_rate = fluid.dygraph.CosineDecay(
self.learning_rate, step_per_epoch, self.num_epochs)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
......@@ -167,6 +162,9 @@ class DARTSearch(object):
self.train_reader)
self.valid_reader = fluid.contrib.reader.distributed_batch_reader(
self.valid_reader)
if self.test_reader is not None:
self.test_reader = fluid.contrib.reader.distributed_batch_reader(
self.test_reader)
train_loader = fluid.io.DataLoader.from_generator(
capacity=64,
......@@ -182,6 +180,17 @@ class DARTSearch(object):
train_loader.set_batch_generator(self.train_reader, places=self.place)
valid_loader.set_batch_generator(self.valid_reader, places=self.place)
if self.test_reader is not None:
test_loader = fluid.io.DataLoader.from_generator(
capacity=64,
use_double_buffer=True,
iterable=True,
return_list=True)
test_loader.set_batch_generator(
self.test_reader, places=self.place)
else:
test_loader = valid_loader
architect = Architect(self.model, learning_rate,
self.arch_learning_rate, self.place,
self.unrolled)
......@@ -199,8 +208,8 @@ class DARTSearch(object):
self.train_one_epoch(train_loader, valid_loader, architect,
optimizer, epoch)
if epoch == self.num_epochs - 1:
# valid_top1 = self.valid_one_epoch(valid_loader, epoch)
logger.info("Epoch {}, valid_acc {:.6f}".format(epoch, 1))
if save_parameters:
fluid.save_dygraph(self.model.state_dict(), "./weights")
# if epoch == self.num_epochs - 1:
# self.valid_one_epoch(test_loader, epoch)
# if save_parameters:
# fluid.save_dygraph(self.model.state_dict(), "./weights")
......@@ -57,7 +57,7 @@ class PrePostProcessLayer(Layer):
elif cmd == "d": # add dropout
if dropout_rate:
self.functors.append(lambda x: fluid.layers.dropout(
x, dropout_prob=dropout_rate, is_test=False))
x, dropout_prob=dropout_rate, is_test=True))
self.exec_order += "d"
def forward(self, x, residual=None):
......@@ -111,8 +111,8 @@ class PositionwiseFeedForwardLayer(Layer):
hidden = fluid.layers.dropout(
hidden,
dropout_prob=self._dropout_rate,
upscale_in_train="upscale_in_train",
is_test=False)
# upscale_in_train="upscale_in_train",
is_test=True)
out = self._h2o(hidden)
return out
......@@ -218,13 +218,13 @@ class MultiHeadAttentionLayer(Layer):
#alpha=self._d_model**-0.5)
if attn_bias is not None:
product += attn_bias
weights = fluid.layers.softmax(product)
weights = fluid.layers.softmax(product) # 48
if self._dropout_rate:
weights_droped = fluid.layers.dropout(
weights,
dropout_prob=self._dropout_rate,
dropout_implementation="upscale_in_train",
is_test=False)
# dropout_implementation="upscale_in_train",
is_test=True)
out = fluid.layers.matmul(weights_droped, transpose_v)
else:
out = fluid.layers.matmul(weights, transpose_v)
......