diff --git a/paddleslim/common/meter.py b/paddleslim/common/meter.py
index 6770257971a75018d011de444715729f23a6b778..b9032756478ec37d64fc26f29f2e7542e60d71b0 100644
--- a/paddleslim/common/meter.py
+++ b/paddleslim/common/meter.py
@@ -16,8 +16,9 @@ __all__ = ['AvgrageMeter']
 
 
 class AvgrageMeter(object):
-    def __init__(self):
+    def __init__(self, format="{}"):
         self.reset()
+        self._format = format
 
     def reset(self):
         self.avg = 0
@@ -28,3 +29,6 @@ class AvgrageMeter(object):
         self.sum += val * n
         self.cnt += n
         self.avg = self.sum / self.cnt
+
+    def __repr__(self):
+        return self._format.format(self.avg)
diff --git a/paddleslim/nas/darts/architect.py b/paddleslim/nas/darts/architect.py
index 4a0eed12d2acce895a1b2672829375cd4b5fbe49..012a5f7cf78da544d1c219e1910106e07801235f 100644
--- a/paddleslim/nas/darts/architect.py
+++ b/paddleslim/nas/darts/architect.py
@@ -56,7 +56,10 @@ class Architect(object):
         else:
             loss = self._backward_step(valid_data)
             self.optimizer.minimize(loss)
+
+#        print("alphas gradient: {}".format(self.model.arch_parameters()[0].gradient()))
         self.optimizer.clear_gradients()
+        return self.model.arch_parameters()[0].gradient()
 
     def _backward_step(self, valid_data):
         loss = self.model.loss(valid_data)
diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py
index e3540f31f70342b03e1d0693ce25db17e6f3eb04..e3e554fc2900c73f9979a8d4ad1e322e7e153ef3 100755
--- a/paddleslim/nas/darts/search_space/conv_bert/cls.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py
@@ -50,7 +50,9 @@ class AdaBERTClassifier(Layer):
                  beta=4,
                  conv_type="conv_bn",
                  search_layer=True,
-                 teacher_model=None):
+                 teacher_model=None,
+                 alphas=None,
+                 k=None):
         super(AdaBERTClassifier, self).__init__()
         self._n_layer = n_layer
         self._num_labels = num_labels
@@ -60,6 +62,8 @@
         self._beta = beta
         self._conv_type = conv_type
         self._search_layer = search_layer
+        self._alphas = alphas
+        self._k = k
         print(
             "----------------------load teacher model and test----------------------------------------"
         )
@@ -86,20 +90,36 @@
                 bias_attr=fluid.ParamAttr(
                     name="s_cls_out_%d_b" % i,
                     initializer=fluid.initializer.Constant(0.)))
-            fc = self.add_sublayer("cls_fc_%d" % i, fc)
+            fc = self.add_sublayer("s_cls_fc_%d" % i, fc)
             self.cls_fc.append(fc)
 
-    def forward(self, data_ids):
+    def forward(self, data_ids, alphas=None, k=None):
         src_ids = data_ids[0]
         position_ids = data_ids[1]
         sentence_ids = data_ids[2]
-        return self.student(src_ids, position_ids, sentence_ids)
+        return self.student(
+            src_ids,
+            position_ids,
+            sentence_ids,
+            alphas=self._alphas,
+            k=self._k)
 
     def arch_parameters(self):
         return self.student.arch_parameters()
 
+    def model_parameters(self):
+
+        model_parameters = [
+            p for p in self.student.parameters()
+            if p.name not in [a.name for a in self.arch_parameters()]
+        ]
+        return model_parameters
+
     def genotype(self):
-        return self.arch_parameters()
+        alphas = self.arch_parameters()[0].numpy()
+        alphas = [np.argmax(edge) for edge in alphas]
+        k = np.argmax(self.arch_parameters()[1].numpy())
+        return "layers: {}; edges: {} ".format(k, alphas)
 
     def new(self):
         model_new = AdaBERTClassifier(
@@ -108,8 +128,7 @@
         )
         return model_new
 
-    def loss(self, data_ids):
-        T = 1.0
+    def valid(self, data_ids):
         src_ids = data_ids[0]
         position_ids = data_ids[1]
         sentence_ids = data_ids[2]
@@ -117,16 +136,57 @@
         labels = data_ids[4]
         flops = []
         model_size = []
+        alphas = self.arch_parameters()[0].numpy(
+        ) if self._alphas is None else self._alphas
+        k = self.arch_parameters()[1].numpy() if self._k is None else self._k
+
+        print(alphas.shape)
+        print(k.shape)
+
         enc_outputs, next_sent_feats, k_i = self.student(
             src_ids,
             position_ids,
             sentence_ids,
             flops=flops,
-            model_size=model_size)
+            model_size=model_size,
+            alphas=alphas,
+            k=k)
+
+        logits = self.cls_fc[-1](next_sent_feats[-1])
+        probs = fluid.layers.softmax(logits)
+        accuracy = fluid.layers.accuracy(input=probs, label=labels)
+
+        model_size = np.sum(model_size)
+        flops = np.sum(flops)
+        ret = {
+            "accuracy": accuracy.numpy(),
+            "model_size(MB)": model_size / 1e6,
+            "FLOPs(M)": flops / 1e6
+        }
+        return ret
+
+    def loss(self, data_ids):
+        T = 1.0
+        src_ids = data_ids[0]
+        position_ids = data_ids[1]
+        sentence_ids = data_ids[2]
+        input_mask = data_ids[3]
+        labels = data_ids[4]
+        flops = []
+        model_size = []
         self.teacher.eval()
         total_loss, t_logits, t_losses, accuracys, num_seqs = self.teacher(
             data_ids)
+        self.teacher.train()
+
+        enc_outputs, next_sent_feats, k_i = self.student(
+            src_ids,
+            position_ids,
+            sentence_ids,
+            flops=flops,
+            model_size=model_size,
+            alphas=self._alphas,
+            k=self._k)
 
         # define kd loss
         kd_losses = []
@@ -140,21 +200,16 @@
         kd_weights = np.exp(kd_weights - np.max(kd_weights))
         kd_weights = kd_weights / kd_weights.sum(axis=0)
-
+        s_probs = None
         for i in range(len(next_sent_feats)):
             j = int(np.ceil(i * (float(len(t_logits)) / len(next_sent_feats))))
             t_logit = t_logits[j]
             s_sent_feat = next_sent_feats[i]
             fc = self.cls_fc[i]
-            s_sent_feat = fluid.layers.dropout(
-                x=s_sent_feat,
-                dropout_prob=0.1,
-                dropout_implementation="upscale_in_train")
             s_logits = fc(s_sent_feat)
-
+            t_logit.stop_gradient = True
             t_probs = fluid.layers.softmax(t_logit)
             s_probs = fluid.layers.softmax(s_logits)
-            t_probs.stop_gradient = True
             kd_loss = t_probs * fluid.layers.log(s_probs / T)
             kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
             kd_loss = kd_loss * kd_weights[i]
@@ -167,17 +222,28 @@
         ce_loss = fluid.layers.cross_entropy(s_probs, labels)
         ce_loss = fluid.layers.reduce_mean(ce_loss) * k_i
 
+        len_model_size = len(model_size)
         # define e loss
-        model_size = fluid.layers.sum(model_size)
-        # print("model_size: {}".format(model_size.numpy()/1e6))
+        if self._alphas is not None:
+            flops = np.sum(flops)
+            model_size = np.sum(model_size)
+        else:
+            flops = fluid.layers.sum(flops)
+            model_size = fluid.layers.sum(model_size)
         model_size = model_size / self.student.max_model_size()
-        flops = fluid.layers.sum(flops) / self.student.max_flops()
-        e_loss = (len(next_sent_feats) * k_i / self._n_layer) * (
-            flops + model_size)
+        flops = flops / self.student.max_flops()
+        e_loss = (flops + model_size) * (len(next_sent_feats) * k_i /
+                                         self._n_layer)
+        print(
+            "len(next_sent_feats): {}; k_i: {}; flops: {}; model_size: {}; len: {}".
+            format(
+                len(next_sent_feats), k_i,
+                flops.numpy(), model_size.numpy(), len_model_size))
         # define total loss
         loss = (1 - self._gamma
                 ) * ce_loss - self._gamma * kd_loss + self._beta * e_loss
-        # print("ce_loss: {}; kd_loss: {}; e_loss: {}".format((
-        #     1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(), beta *
-        #     e_loss.numpy()))
         return loss, ce_loss, kd_loss, e_loss
+
+
+#        loss = ce_loss + self._beta * e_loss
+#        return loss, ce_loss, ce_loss, e_loss
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
index ce9adbb7d529ff2377838dd9b917800a62605dd2..ea3ce283259caa63754dd574a364d927dd708140 100644
--- a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
@@ -110,10 +110,9 @@ class BertModelLayer(Layer):
                 position_ids,
                 sentence_ids,
                 flops=[],
-                model_size=[]):
-        """
-        forward
-        """
+                model_size=[],
+                alphas=None,
+                k=None):
         src_emb = self._src_emb(src_ids)
         pos_emb = self._pos_emb(position_ids)
         sent_emb = self._sent_emb(sentence_ids)
@@ -122,9 +121,8 @@
         emb_out = emb_out + sent_emb
 
         emb_out = self._emb_fac(emb_out)
-
         enc_outputs, k_i = self._encoder(
-            emb_out, flops=flops, model_size=model_size)
+            emb_out, flops=flops, model_size=model_size, alphas=alphas, k=k)
 
         if not self.return_pooled_out:
             return enc_outputs
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/cls.py b/paddleslim/nas/darts/search_space/conv_bert/model/cls.py
index cf809ae23286778f5988a01c1de0072af1366005..c287c091f92f659ba24d9a85d550d651d6b3835b 100644
--- a/paddleslim/nas/darts/search_space/conv_bert/model/cls.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/cls.py
@@ -57,12 +57,12 @@ class ClsModelLayer(Layer):
             fc = Linear(
                 input_dim=self.config["hidden_size"],
                 output_dim=num_labels,
-                param_attr=fluid.ParamAttr(
+                param_attr=fluid.paramattr(
                     name="cls_out_%d_w" % i,
-                    initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
-                bias_attr=fluid.ParamAttr(
+                    initializer=fluid.initializer.truncatednormal(scale=0.02)),
+                bias_attr=fluid.paramattr(
                     name="cls_out_%d_b" % i,
-                    initializer=fluid.initializer.Constant(0.)))
+                    initializer=fluid.initializer.constant(0.)))
             fc = self.add_sublayer("cls_fc_%d" % i, fc)
             self.cls_fc.append(fc)
 
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
index 895917a3ab8eeb0b7cf34d9c4fbada728aac0121..8481f59a684f4d7c47cc578558f95d315c69594a 100644
--- a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
@@ -18,6 +18,7 @@ from __future__ import division
 from __future__ import print_function
 
 import numpy as np
+from collections import Iterable
 
 import paddle
 import paddle.fluid as fluid
@@ -25,13 +26,13 @@ from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, Ba
 from paddle.fluid.initializer import NormalInitializer
 
 GConv_PRIMITIVES = [
-    'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3', 'dil_gconv_5',
-    'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'none', 'skip_connect'
+    'none', 'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3',
+    'dil_gconv_5', 'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'skip_connect'
 ]
 
 ConvBN_PRIMITIVES = [
-    'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
-    'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none',
+    'none', 'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
+    'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3',
     'skip_connect'
 ]
 
@@ -117,7 +118,11 @@ class MixedOp(fluid.dygraph.Layer):
 
     def forward(self, x, weights, flops=[], model_size=[]):
         for i in range(len(self._ops)):
-            if weights[i].numpy() != 0:
+            if isinstance(weights, Iterable):
+                weights_i = weights[i]
+            else:
+                weights_i = weights[i].numpy()
+            if weights_i != 0:
                 flops.append(FLOPs.values()[i] * weights[i])
                 model_size.append(ModelSize.values()[i] * weights[i])
                 return self._ops[i](x) * weights[i]
@@ -166,6 +171,7 @@ class ConvBNRelu(fluid.dygraph.Layer):
                  use_cudnn=True,
                  name=None):
         super(ConvBNRelu, self).__init__()
+        self._name = name
         conv_std = (2.0 /
                     (filter_size[0] * filter_size[1] * out_c * in_c))**0.5
         conv_param = fluid.ParamAttr(
@@ -187,7 +193,7 @@
     def forward(self, inputs):
         conv = self.conv(inputs)
         bn = self.bn(conv)
-        return bn
+        return conv
 
 
 class GateConv(fluid.dygraph.Layer):
@@ -261,24 +267,30 @@
         states = [s0, s1]
         offset = 0
         for i in range(self._steps):
-            s = fluid.layers.sums([
-                self._ops[offset + j](h,
-                                      weights[offset + j],
-                                      flops=flops,
-                                      model_size=model_size)
-                for j, h in enumerate(states)
-            ])
+            edges = []
+            for j, h in enumerate(states):
+                edge = self._ops[offset + j](h,
+                                             weights[offset + j],
+                                             flops=flops,
+                                             model_size=model_size)
+                edges.append(edge)
+            s = edges[0]
+            for n in range(1, len(edges)):
+                s = s + edges[n]
+#            s = fluid.layers.sums(edges)
             offset += len(states)
             states.append(s)
-        out = fluid.layers.sum(states[-self._steps:])
+
+        states = states[-self._steps:]
+        out = states[0]
+        for n in range(1, len(states)):
+            out = out + states[n]
+
+#        out = fluid.layers.sums(states[-self._steps:])
         return out
 
 
 class EncoderLayer(Layer):
-    """
-        encoder
-    """
-
     def __init__(self,
                  n_layer,
                  hidden_size=768,
@@ -342,15 +354,17 @@
                 default_initializer=NormalInitializer(
                     loc=0.0, scale=1e-3))
 
-    def forward(self, enc_input, flops=[], model_size=[]):
+    def forward(self, enc_input, flops=[], model_size=[], alphas=None, k=None):
         tmp = fluid.layers.reshape(
             enc_input, [-1, 1, enc_input.shape[1],
                         self._hidden_size])  #(bs, 1, seq_len, hidden_size)
 
         tmp = self.conv0(tmp)  # (bs, hidden_size, seq_len, 1)
 
-        alphas = gumbel_softmax(self.alphas)
-        k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
+        if alphas is None:
+            alphas = gumbel_softmax(self.alphas)
+        if k is None:
+            k = fluid.layers.reshape(gumbel_softmax(self.k), [-1])
 
         outputs = []
         s0 = s1 = tmp
@@ -364,7 +378,11 @@
                 enc_output, [-1, enc_output.shape[1],
                              self._hidden_size])  # (bs, seq_len, hidden_size)
             outputs.append(enc_output)
-            if self._search_layer and k[i].numpy() != 0:
+            if isinstance(k, Iterable):
+                k_i = k[i]
+            else:
+                k_i = k[i].numpy()
+            if k_i != 0:
                 outputs[-1] = outputs[-1] * k[i]
                 return outputs, k[i]
         return outputs, 1.0
diff --git a/paddleslim/nas/darts/train_search.py b/paddleslim/nas/darts/train_search.py
index cf832f8b856e10c8cd572e635f57a93fa42d6a2f..1e460cbfa58dc25661b2db55b8cf062e7bd9682d 100644
--- a/paddleslim/nas/darts/train_search.py
+++ b/paddleslim/nas/darts/train_search.py
@@ -18,10 +18,12 @@ from __future__ import print_function
 
 __all__ = ['DARTSearch']
 
+import math
 import logging
 from itertools import izip
 import numpy as np
 import paddle.fluid as fluid
+from paddle.fluid.framework import Variable
 from paddle.fluid.dygraph.base import to_variable
 from ...common import AvgrageMeter, get_logger
 from .architect import Architect
@@ -41,6 +43,7 @@ class DARTSearch(object):
                  model,
                  train_reader,
                  valid_reader,
+                 test_reader=None,
                  learning_rate=0.025,
                  batchsize=64,
                  num_imgs=50000,
@@ -54,6 +57,7 @@
         self.model = model
         self.train_reader = train_reader
         self.valid_reader = valid_reader
+        self.test_reader = test_reader
         self.learning_rate = learning_rate
         self.batchsize = batchsize
         self.num_imgs = num_imgs
@@ -82,10 +86,13 @@
         step_id = 0
         for train_data, valid_data in izip(train_loader(), valid_loader()):
             if epoch >= self.epochs_no_archopt:
-                architect.step(train_data, valid_data)
+                alphas_grad = architect.step(train_data, valid_data)
 
             loss, ce_loss, kd_loss, e_loss = self.model.loss(train_data)
-
+            if math.isnan(e_loss.numpy()):
+                print("alphas_grad: {}".format(alphas_grad))
+                print("alphas: {}".format(self.model.arch_parameters()[0]
+                                          .numpy()))
             if self.use_data_parallel:
                 loss = self.model.scale_loss(loss)
                 loss.backward()
@@ -93,67 +100,55 @@
             else:
                 loss.backward()
 
-            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
-            optimizer.minimize(loss, grad_clip)
+#            grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5)
+#            optimizer.minimize(loss, grad_clip)
+            optimizer.minimize(loss)
+
             self.model.clear_gradients()
 
             batch_size = train_data[0].shape[0]
             objs.update(loss.numpy(), batch_size)
+
+            e_loss = e_loss.numpy() if isinstance(e_loss, Variable) else e_loss
             ce_losses.update(ce_loss.numpy(), batch_size)
             kd_losses.update(kd_loss.numpy(), batch_size)
-            e_losses.update(e_loss.numpy(), batch_size)
+            e_losses.update(e_loss, batch_size)
 
             if step_id % self.log_freq == 0:
-                #logger.info("Train Epoch {}, Step {}, loss {:.6f}; ce: {:.6f}; kd: {:.6f}; e: {:.6f}".format(
-                #    epoch, step_id, objs.avg[0], ce_losses.avg[0], kd_losses.avg[0], e_losses.avg[0]))
                 logger.info(
                     "Train Epoch {}, Step {}, loss {}; ce: {}; kd: {}; e: {}".
                     format(epoch, step_id, loss.numpy(),
-                           ce_loss.numpy(), kd_loss.numpy(), e_loss.numpy()))
+                           ce_loss.numpy(), kd_loss.numpy(), e_loss))
             step_id += 1
         return objs.avg[0]
 
     def valid_one_epoch(self, valid_loader, epoch):
-        objs = AvgrageMeter()
-        top1 = AvgrageMeter()
-        top5 = AvgrageMeter()
         self.model.eval()
-
+        meters = {}
         for step_id, valid_data in enumerate(valid_loader):
-            image = to_variable(image)
-            label = to_variable(label)
-            n = image.shape[0]
-            logits = self.model(image)
-            prec1 = fluid.layers.accuracy(input=logits, label=label, k=1)
-            prec5 = fluid.layers.accuracy(input=logits, label=label, k=5)
-            loss = fluid.layers.reduce_mean(
-                fluid.layers.softmax_with_cross_entropy(logits, label))
-            objs.update(loss.numpy(), n)
-            top1.update(prec1.numpy(), n)
-            top5.update(prec5.numpy(), n)
+            ret = self.model.valid(valid_data)
+            for key, value in ret.items():
+                if key not in meters:
+                    meters[key] = AvgrageMeter()
+                meters[key].update(value, 1)
             if step_id % self.log_freq == 0:
-                logger.info(
-                    "Valid Epoch {}, Step {}, loss {:.6f}, acc_1 {:.6f}, acc_5 {:.6f}".
-                    format(epoch, step_id, objs.avg[0], top1.avg[0], top5.avg[
-                        0]))
-        return top1.avg[0]
+                logger.info("Valid Epoch {}, Step {}, {}".format(
+                    epoch, step_id, meters))
 
     def train(self):
         if self.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
 
-        model_parameters = [
-            p for p in self.model.parameters()
-            if p.name not in [a.name for a in self.model.arch_parameters()]
-        ]
-        logger.info("param size = {:.6f}MB".format(
+        model_parameters = self.model.model_parameters()
+        logger.info("parameter size in super net: {:.6f}M".format(
             count_parameters_in_MB(model_parameters)))
         step_per_epoch = int(self.num_imgs * 0.5 / self.batchsize)
         if self.unrolled:
             step_per_epoch *= 2
         learning_rate = fluid.dygraph.CosineDecay(
             self.learning_rate, step_per_epoch, self.num_epochs)
+
         optimizer = fluid.optimizer.MomentumOptimizer(
             learning_rate,
             0.9,
@@ -167,6 +162,9 @@
             self.train_reader)
         self.valid_reader = fluid.contrib.reader.distributed_batch_reader(
             self.valid_reader)
+        if self.test_reader is not None:
+            self.test_reader = fluid.contrib.reader.distributed_batch_reader(
+                self.test_reader)
 
         train_loader = fluid.io.DataLoader.from_generator(
             capacity=64,
@@ -182,6 +180,17 @@
         train_loader.set_batch_generator(self.train_reader, places=self.place)
         valid_loader.set_batch_generator(self.valid_reader, places=self.place)
 
+        if self.test_reader is not None:
+            test_loader = fluid.io.DataLoader.from_generator(
+                capacity=64,
+                use_double_buffer=True,
+                iterable=True,
+                return_list=True)
+            test_loader.set_batch_generator(
+                self.test_reader, places=self.place)
+        else:
+            test_loader = valid_loader
+
         architect = Architect(self.model, learning_rate,
                               self.arch_learning_rate, self.place,
                               self.unrolled)
@@ -199,8 +208,8 @@
             self.train_one_epoch(train_loader, valid_loader, architect,
                                  optimizer, epoch)
 
-            if epoch == self.num_epochs - 1:
-                # valid_top1 = self.valid_one_epoch(valid_loader, epoch)
-                logger.info("Epoch {}, valid_acc {:.6f}".format(epoch, 1))
-            if save_parameters:
-                fluid.save_dygraph(self.model.state_dict(), "./weights")
+
+#            if epoch == self.num_epochs - 1:
+#                self.valid_one_epoch(test_loader, epoch)
+#            if save_parameters:
+#                fluid.save_dygraph(self.model.state_dict(), "./weights")
diff --git a/paddleslim/teachers/bert/model/transformer_encoder.py b/paddleslim/teachers/bert/model/transformer_encoder.py
index ff6e2b092b48b730238c7515b96f634f6226e597..376c1e18586962fede245e74b6da1ab21a5e2609 100644
--- a/paddleslim/teachers/bert/model/transformer_encoder.py
+++ b/paddleslim/teachers/bert/model/transformer_encoder.py
@@ -57,7 +57,7 @@ class PrePostProcessLayer(Layer):
             elif cmd == "d":  # add dropout
                 if dropout_rate:
                     self.functors.append(lambda x: fluid.layers.dropout(
-                        x, dropout_prob=dropout_rate, is_test=False))
+                        x, dropout_prob=dropout_rate, is_test=True))
                     self.exec_order += "d"
 
     def forward(self, x, residual=None):
@@ -111,8 +111,8 @@ class PositionwiseFeedForwardLayer(Layer):
         hidden = fluid.layers.dropout(
             hidden,
             dropout_prob=self._dropout_rate,
-            upscale_in_train="upscale_in_train",
-            is_test=False)
+            # upscale_in_train="upscale_in_train",
+            is_test=True)
         out = self._h2o(hidden)
         return out
 
@@ -218,13 +218,13 @@ class MultiHeadAttentionLayer(Layer):
         #alpha=self._d_model**-0.5)
         if attn_bias is not None:
             product += attn_bias
-        weights = fluid.layers.softmax(product)
+        weights = fluid.layers.softmax(product)  # 48
         if self._dropout_rate:
             weights_droped = fluid.layers.dropout(
                 weights,
                 dropout_prob=self._dropout_rate,
-                dropout_implementation="upscale_in_train",
-                is_test=False)
+                # dropout_implementation="upscale_in_train",
+                is_test=True)
             out = fluid.layers.matmul(weights_droped, transpose_v)
         else:
             out = fluid.layers.matmul(weights, transpose_v)