diff --git a/demo/bert/train_distill.py b/demo/bert/train_distill.py
index 8f4758b05b700a6d39440e6e08bc68913dc4f7e9..c40ae3670fabecdb21ba18e539b9442f823c92e5 100755
--- a/demo/bert/train_distill.py
+++ b/demo/bert/train_distill.py
@@ -10,26 +10,34 @@ import pickle
 import logging
 
 from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.nas.darts import count_parameters_in_MB
+
 logger = get_logger(__name__, level=logging.INFO)
 
 
 def valid_one_epoch(model, valid_loader, epoch, log_freq):
     accs = AvgrageMeter()
     ce_losses = AvgrageMeter()
-    model.student.eval()
+    t_accs = AvgrageMeter()
+
+    model.eval()
 
     step_id = 0
     for valid_data in valid_loader():
         try:
-            loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
+            loss, acc, ce_loss, _, _, t_acc = model._layers.loss(valid_data,
+                                                                  epoch)
         except:
-            loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
+            loss, acc, ce_loss, _, _, t_acc = model.loss(valid_data, epoch)
 
         batch_size = valid_data[0].shape[0]
         ce_losses.update(ce_loss.numpy(), batch_size)
         accs.update(acc.numpy(), batch_size)
+        t_accs.update(t_acc.numpy(), batch_size)
+
         step_id += 1
-    return ce_losses.avg[0], accs.avg[0]
+
+    return ce_losses.avg[0], accs.avg[0], t_accs.avg[0]
 
 
 def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
@@ -38,18 +46,19 @@ def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
     accs = AvgrageMeter()
     ce_losses = AvgrageMeter()
     kd_losses = AvgrageMeter()
-    model.student.train()
+    t_accs = AvgrageMeter()
+    model.train()
 
     step_id = 0
     for train_data in train_loader():
         batch_size = train_data[0].shape[0]
 
         if use_data_parallel:
-            total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
+            total_loss, acc, ce_loss, kd_loss, _, t_acc = model._layers.loss(
                 train_data, epoch)
         else:
-            total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
-                                                              epoch)
+            total_loss, acc, ce_loss, kd_loss, _, t_acc = model.loss(
+                train_data, epoch)
 
         if use_data_parallel:
             total_loss = model.scale_loss(total_loss)
@@ -63,19 +72,23 @@ def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
         accs.update(acc.numpy(), batch_size)
         ce_losses.update(ce_loss.numpy(), batch_size)
         kd_losses.update(kd_loss.numpy(), batch_size)
+        t_accs.update(t_acc.numpy(), batch_size)
 
         if step_id % log_freq == 0:
             logger.info(
-                "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f};".
+                "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, teacher_acc {:.6f};".
                 format(epoch, step_id,
-                       optimizer.current_step_lr(), total_losses.avg[0],
-                       ce_losses.avg[0], kd_losses.avg[0], accs.avg[0]))
+                       optimizer.current_step_lr(), total_losses.avg[
+                           0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0],
+                       t_accs.avg[0]))
         step_id += 1
+    return total_losses.avg[0], accs.avg[0]
 
 
 def main():
     # whether use multi-gpus
-    use_data_parallel = False
+    device_num = fluid.dygraph.parallel.Env().nranks
+    use_data_parallel = device_num > 1
     place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
     ).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
@@ -88,12 +101,12 @@ def main():
 
     max_seq_len = 128
     batch_size = 192
-    hidden_size = 768
+    hidden_size = 128
     emb_size = 768
     epoch = 80
-    log_freq = 10
+    log_freq = 1
 
-    task_name = 'mnli'
+    task_name = 'mrpc'
 
     if task_name == 'mrpc':
         data_dir = "./data/glue_data/MRPC/"
@@ -110,7 +123,6 @@ def main():
         num_labels = 3
         processor_func = MnliProcessor
 
-    device_num = fluid.dygraph.parallel.Env().nranks
     use_fixed_gumbel = True
     train_phase = "train"
     val_phase = "dev"
@@ -129,7 +141,11 @@ def main():
         emb_size=emb_size,
         teacher_model=teacher_model_dir,
         data_dir=data_dir,
-        use_fixed_gumbel=use_fixed_gumbel)
+        use_fixed_gumbel=use_fixed_gumbel,
+        t=1.0)
+
+    logger.info("param size = {:.6f}MB".format(
+        count_parameters_in_MB(model.student.parameters())))
 
     learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
@@ -174,7 +190,8 @@ def main():
         capacity=128,
         use_double_buffer=True,
         iterable=True,
-        return_list=True)
+        return_list=True,
+        use_multiprocess=True)
     dev_loader = fluid.io.DataLoader.from_generator(
         capacity=128,
         use_double_buffer=True,
@@ -190,14 +207,18 @@ def main():
 
     best_valid_acc = 0
     for epoch_id in range(epoch):
-        train_one_epoch(model, train_loader, optimizer, epoch_id,
-                        use_data_parallel, log_freq)
-        loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
+        total_loss, train_acc = train_one_epoch(
+            model, train_loader, optimizer, epoch_id, use_data_parallel,
+            log_freq)
+        logger.info("train set, total_loss {:.6f}; acc {:.6f};".format(
+            total_loss, train_acc))
+        loss, acc, t_acc = valid_one_epoch(model, dev_loader, epoch_id,
+                                           log_freq)
         if acc > best_valid_acc:
             best_valid_acc = acc
         logger.info(
-            "dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format(
-                loss, acc, best_valid_acc))
+            "dev set, ce_loss {:.6f}; teacher_acc: {:.6f}, acc {:.6f}, best_acc {:.6f};".
+            format(loss, t_acc, acc, best_valid_acc))
 
 
 if __name__ == '__main__':
diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py
index 3c46c443117399993d22200f8832b1bd1f66dcf2..a201123f2d9d4e4e54edc064e1944e61f9f08408 100644
--- a/paddleslim/nas/darts/search_space/conv_bert/cls.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py
@@ -57,7 +57,7 @@ class AdaBERTClassifier(Layer):
                  use_fixed_gumbel=False,
                  gumbel_alphas=None,
                  fix_emb=False,
-                 t=5.0):
+                 t=1.0):
        super(AdaBERTClassifier, self).__init__()
        self._n_layer = n_layer
        self._num_labels = num_labels
@@ -78,8 +78,9 @@ class AdaBERTClassifier(Layer):
         self.teacher = BERTClassifier(
             num_labels, task_name=task_name, model_path=self._teacher_model)
         # global setting, will be overwritten when training(about 1% acc loss)
-        self.teacher.eval()
         self.teacher.test(self._data_dir)
+        self.teacher.eval()
+
         print(
             "----------------------finish load teacher model and test----------------------------------------"
         )
@@ -116,49 +117,67 @@ class AdaBERTClassifier(Layer):
 
     def loss(self, data_ids, epoch):
         labels = data_ids[4]
 
-        s_logits = self.student(data_ids, epoch)
+        s_logits, s_fea = self.student(data_ids, epoch)
+
+        # make sure techer is compute in eval mode
+        self.teacher.eval()
+        t_total_loss, t_logits, t_losses, t_accs, _, t_fea = self.teacher(
+            data_ids)
+        if self.student.training:
+            self.student.train()
+        t_logits[-1].stop_gradient = True
+
+        #kd_loss = fluid.layers.mse_loss(s_logits[-1], t_logits[-1])
+        #kd_loss = fluid.layers.mse_loss(s_fea, t_fea)
 
-        t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)
+
+        #kd_loss = fluid.layers.reduce_sum(fluid.layers.square(s_logits[-1] - t_logits[-1]))
+
+        t_probs = fluid.layers.softmax(t_logits[-1] / self.T)
+        s_probs = fluid.layers.softmax(s_logits[-1] / self.T)
+        kd_loss = fluid.layers.reduce_mean(
+            fluid.layers.cross_entropy(
+                input=s_probs, label=t_probs, soft_label=True))
 
         #define kd loss
-        kd_weights = []
-        for i in range(len(s_logits)):
-            j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
-            kd_weights.append(t_losses[j].numpy())
-
-        kd_weights = np.array(kd_weights)
-        kd_weights = np.squeeze(kd_weights)
-        kd_weights = to_variable(kd_weights)
-        kd_weights = fluid.layers.softmax(-kd_weights)
-
-        kd_losses = []
-        for i in range(len(s_logits)):
-            j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
-            t_logit = t_logits[j]
-            s_logit = s_logits[i]
-            t_logit.stop_gradient = True
-            t_probs = fluid.layers.softmax(t_logit)  # P_j^T
-            s_probs = fluid.layers.softmax(s_logit / self.T)  #P_j^S
-            #kd_loss = -t_probs * fluid.layers.log(s_probs)
-            kd_loss = fluid.layers.cross_entropy(
-                input=s_probs, label=t_probs, soft_label=True)
-            kd_loss = fluid.layers.reduce_mean(kd_loss)
-            kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
-            kd_losses.append(kd_loss)
-        kd_loss = fluid.layers.sum(kd_losses)
+        # kd_weights = []
+        # for i in range(len(s_logits)):
+        #     j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
+        #     kd_weights.append(t_losses[j].numpy())
+
+        # kd_weights = np.array(kd_weights)
+        # kd_weights = np.squeeze(kd_weights)
+        # kd_weights = to_variable(kd_weights)
+        # kd_weights = fluid.layers.softmax(-kd_weights)
+
+        # kd_losses = []
+        # for i in range(len(s_logits)):
+        #     j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
+        #     t_logit = t_logits[j]
+        #     s_logit = s_logits[i]
+        #     t_logit.stop_gradient = True
+        #     t_probs = fluid.layers.softmax(t_logit)  # P_j^T
+        #     s_probs = fluid.layers.softmax(s_logit / self.T)  #P_j^S
+        #     #kd_loss = -t_probs * fluid.layers.log(s_probs)
+        #     kd_loss = fluid.layers.cross_entropy(
+        #         input=s_probs, label=t_probs, soft_label=True)
+        #     kd_loss = fluid.layers.reduce_mean(kd_loss)
+        #     kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
+        #     kd_losses.append(kd_loss)
+        # kd_loss = fluid.layers.sum(kd_losses)
 
         losses = []
-        for logit in s_logits:
+        for logit in [s_logits[-1]]:
             ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
                 logits=logit, label=labels, return_softmax=True)
+            #print("training: ", self.student.training, probs.numpy())
+
             loss = fluid.layers.mean(x=ce_loss)
             losses.append(loss)
 
-        num_seqs = fluid.layers.create_tensor(dtype='int64')
-        accuracy = fluid.layers.accuracy(
-            input=probs, label=labels, total=num_seqs)
+        accuracy = fluid.layers.accuracy(input=probs, label=labels)
 
         ce_loss = fluid.layers.sum(losses)
-        total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
+        #total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
+        total_loss = kd_loss
 
-        return total_loss, accuracy, ce_loss, kd_loss, s_logits
+        return total_loss, accuracy, ce_loss, kd_loss, s_logits, t_accs[-1]
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
index 38de45e6d88058800b4880d7d2354f5a2b5605f9..c31f5fd93c3a2b1f9b3ab4a64fb7115bd49584d7 100755
--- a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
@@ -91,6 +91,11 @@ class BertModelLayer(Layer):
             output_dim=self._hidden_size,
             param_attr=fluid.ParamAttr(name="s_emb_factorization"))
 
+        self._emb_fac_1 = Linear(
+            input_dim=self._emb_size,
+            output_dim=self._hidden_size,
+            param_attr=fluid.ParamAttr(name="s_emb_factorization_1"))
+
         self._encoder = EncoderLayer(
             num_labels=num_labels,
             n_layer=self._n_layer,
@@ -103,10 +108,6 @@ class BertModelLayer(Layer):
         return self._src_emb.parameters() + self._pos_emb.parameters(
         ) + self._sent_emb.parameters()
 
-    def emb_names(self):
-        return self._src_emb.parameters() + self._pos_emb.parameters(
-        ) + self._sent_emb.parameters()
-
     def max_flops(self):
         return self._encoder.max_flops
 
@@ -129,6 +130,6 @@ class BertModelLayer(Layer):
         emb_out_1 = self._emb_fac(src_emb_1)
 
         # (bs, seq_len, hidden_size)
-        enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
+        enc_outputs, fea = self._encoder(emb_out_0, emb_out_1, epoch)
 
-        return enc_outputs
+        return enc_outputs, fea
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
index bad9a1e266e04022543269b32f3b45e709d97451..6d4ea2491865c8f41707d959c09bee2e86fa995a 100755
--- a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
@@ -45,8 +45,8 @@ OPS = {
     'avg_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='avg'),
     'max_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='max'),
-    'none': lambda n_channel, name: Zero(),
     'skip_connect': lambda n_channel, name: Identity(),
+    'none': lambda n_channel, name: Zero(),
 }
 
 
@@ -61,10 +61,10 @@ class MixedOp(fluid.dygraph.Layer):
             if 'pool' in primitive:
                 gama = ParamAttr(
                     initializer=fluid.initializer.Constant(value=1),
-                    trainable=False)
+                    trainable=True)
                 beta = ParamAttr(
                     initializer=fluid.initializer.Constant(value=0),
-                    trainable=False)
+                    trainable=True)
                 BN = BatchNorm(n_channel, param_attr=gama, bias_attr=beta)
                 op = fluid.dygraph.Sequential(op, BN)
             ops.append(op)
@@ -125,7 +125,7 @@ class ReluConvBN(fluid.dygraph.Layer):
                  filter_size=[3, 1],
                  dilation=1,
                  stride=1,
-                 affine=False,
+                 affine=True,
                  use_cudnn=True,
                  name=None):
         super(ReluConvBN, self).__init__()
@@ -210,40 +210,40 @@ class EncoderLayer(Layer):
         super(EncoderLayer, self).__init__()
         self._n_layer = n_layer
         self._hidden_size = hidden_size
-        self._n_channel = 128
+        self._n_channel = hidden_size
         self._steps = 3
         self._n_ops = len(ConvBN_PRIMITIVES)
         self.use_fixed_gumbel = use_fixed_gumbel
 
-        self.stem0 = fluid.dygraph.Sequential(
-            Conv2D(
-                num_channels=1,
-                num_filters=self._n_channel,
-                filter_size=[3, self._hidden_size],
-                padding=[1, 0],
-                param_attr=fluid.ParamAttr(initializer=MSRA()),
-                bias_attr=False),
-            BatchNorm(
-                num_channels=self._n_channel,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=1)),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0))))
-
-        self.stem1 = fluid.dygraph.Sequential(
-            Conv2D(
-                num_channels=1,
-                num_filters=self._n_channel,
-                filter_size=[3, self._hidden_size],
-                padding=[1, 0],
-                param_attr=fluid.ParamAttr(initializer=MSRA()),
-                bias_attr=False),
-            BatchNorm(
-                num_channels=self._n_channel,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=1)),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0))))
+        # self.stem0 = fluid.dygraph.Sequential(
+        #     Conv2D(
+        #         num_channels=1,
+        #         num_filters=self._n_channel,
+        #         filter_size=[3, self._hidden_size],
+        #         padding=[1, 0],
+        #         param_attr=fluid.ParamAttr(initializer=MSRA()),
+        #         bias_attr=False),
+        #     BatchNorm(
+        #         num_channels=self._n_channel,
+        #         param_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=1)),
+        #         bias_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=0))))
+
+        # self.stem1 = fluid.dygraph.Sequential(
+        #     Conv2D(
+        #         num_channels=1,
+        #         num_filters=self._n_channel,
+        #         filter_size=[3, self._hidden_size],
+        #         padding=[1, 0],
+        #         param_attr=fluid.ParamAttr(initializer=MSRA()),
+        #         bias_attr=False),
+        #     BatchNorm(
+        #         num_channels=self._n_channel,
+        #         param_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=1)),
+        #         bias_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=0))))
 
         cells = []
         for i in range(n_layer):
@@ -271,10 +271,10 @@ class EncoderLayer(Layer):
                 num_channels=self._n_channel,
                 param_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.Constant(value=1),
-                    trainable=False),
+                    trainable=True),
                 bias_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.Constant(value=0),
-                    trainable=False))
+                    trainable=True))
             out = Linear(
                 self._n_channel,
                 num_labels,
@@ -311,17 +311,28 @@ class EncoderLayer(Layer):
         s0 = fluid.layers.unsqueeze(enc_input_0, [1])
         s1 = fluid.layers.unsqueeze(enc_input_1, [1])
 
-        s0 = self.stem0(s0)
-        s1 = self.stem1(s1)
+        s0 = fluid.layers.transpose(s0, [0, 3, 2, 1])
+        s1 = fluid.layers.transpose(s1, [0, 3, 2, 1])
+
+        # s0 = self.stem0(s0)
+        # s1 = self.stem1(s1)
 
         enc_outputs = []
+        fea = []
+
         for i in range(self._n_layer):
             s0, s1 = s1, self._cells[i](s0, s1, alphas)
             # (bs, n_channel, seq_len, 1)
             tmp = self._bns[i](s1)
+            tmp = s1
             tmp = self.pool2d_avg(tmp)
             tmp = fluid.layers.reshape(tmp, shape=[-1, 0])
+            fea.append(tmp)
+            tmp = fluid.layers.dropout(
+                x=tmp,
+                dropout_prob=0.1,
+                dropout_implementation="upscale_in_train")
             tmp = self._outs[i](tmp)
             enc_outputs.append(tmp)
 
-        return enc_outputs
+        return enc_outputs, fea[-1]
diff --git a/paddleslim/teachers/bert/cls.py b/paddleslim/teachers/bert/cls.py
index 8f7e1a4b0775c27f9623dfc3f98dd832a4925cf7..4fb975a8437c4c44f0f9410b5645cb8be2a6c322 100755
--- a/paddleslim/teachers/bert/cls.py
+++ b/paddleslim/teachers/bert/cls.py
@@ -120,12 +120,16 @@ class BERTClassifier(Layer):
         test_data_generator = processor.data_generator(
             batch_size=batch_size, phase='dev', epoch=1, shuffle=False)
 
+        # test train mode test_acc
         self.cls_model.eval()
+        print("test with test mode:...")
+
         total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
 
         for batch in test_data_generator():
             data_ids = create_data(batch)
-            total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
+            total_loss, _, _, np_acces, np_num_seqs, fea = self.cls_model(
+                data_ids)
 
             np_loss = total_loss.numpy()
             np_acc = np_acces[-1].numpy()
diff --git a/paddleslim/teachers/bert/model/cls.py b/paddleslim/teachers/bert/model/cls.py
index bdfef8b5b4d6c6133e176a146f170bbb633701aa..425f2981c35ff8d0a9a81419bf00daa1bbfe00fe 100644
--- a/paddleslim/teachers/bert/model/cls.py
+++ b/paddleslim/teachers/bert/model/cls.py
@@ -118,4 +118,5 @@ class ClsModelLayer(Layer):
             accuracys.append(accuracy)
 
         total_loss = fluid.layers.sum(losses)
-        return total_loss, logits, losses, accuracys, num_seqs
+        return total_loss, logits, losses, accuracys, num_seqs, next_sent_feat[
+            -1]
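
Reviewer note (not part of the patch): the distillation objective that the patched AdaBERTClassifier.loss now optimizes is a temperature-scaled soft-label cross entropy between the last student and teacher logits, replacing the per-layer weighted KD sum. A minimal standalone sketch is below; the helper name soft_label_kd_loss and its temperature argument are illustrative only.

    import paddle.fluid as fluid

    def soft_label_kd_loss(s_logit, t_logit, temperature=1.0):
        # Teacher logits must not receive gradients.
        t_logit.stop_gradient = True
        # Soften both distributions with the same temperature (t=1.0 in the patch).
        t_probs = fluid.layers.softmax(t_logit / temperature)
        s_probs = fluid.layers.softmax(s_logit / temperature)
        # Soft-label cross entropy against the teacher's probabilities,
        # averaged over the batch; used directly as total_loss in the patch.
        return fluid.layers.reduce_mean(
            fluid.layers.cross_entropy(
                input=s_probs, label=t_probs, soft_label=True))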