From ec96bbf2bc8471df0b4577d148cf7b6f5ce68ee0 Mon Sep 17 00:00:00 2001 From: baiyfbupt Date: Sun, 28 Jun 2020 11:19:54 +0800 Subject: [PATCH] adabert multi-gpus --- demo/bert/search_bert.py | 56 ---- demo/bert/train_cell_base.py | 256 +++++++++++------- paddleslim/dist/dml.py | 3 +- paddleslim/nas/darts/architect_for_bert.py | 8 +- .../nas/darts/search_space/conv_bert/cls.py | 112 +++++--- .../search_space/conv_bert/model/bert.py | 19 +- .../conv_bert/model/transformer_encoder.py | 147 +++++----- paddleslim/teachers/bert/cls.py | 8 +- paddleslim/teachers/bert/model/bert.py | 4 + paddleslim/teachers/bert/model/cls.py | 18 +- paddleslim/teachers/bert/reader/cls.py | 23 +- 11 files changed, 377 insertions(+), 277 deletions(-) delete mode 100644 demo/bert/search_bert.py diff --git a/demo/bert/search_bert.py b/demo/bert/search_bert.py deleted file mode 100644 index a97739f3..00000000 --- a/demo/bert/search_bert.py +++ /dev/null @@ -1,56 +0,0 @@ -import paddle.fluid as fluid -from paddleslim.teachers.bert.reader.cls import * -from paddleslim.nas.darts.search_space import AdaBERTClassifier -from paddleslim.nas.darts import DARTSearch - - -def main(): - place = fluid.CUDAPlace(0) - - BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/" - bert_config_path = BERT_BASE_PATH + "/bert_config.json" - vocab_path = BERT_BASE_PATH + "/vocab.txt" - data_dir = "./data/glue_data/MNLI/" - max_seq_len = 512 - do_lower_case = True - batch_size = 32 - epoch = 30 - - processor = MnliProcessor( - data_dir=data_dir, - vocab_path=vocab_path, - max_seq_len=max_seq_len, - do_lower_case=do_lower_case, - in_tokens=False) - - train_reader = processor.data_generator( - batch_size=batch_size, - phase='train', - epoch=epoch, - dev_count=1, - shuffle=True) - - val_reader = processor.data_generator( - batch_size=batch_size, - phase='train', - epoch=epoch, - dev_count=1, - shuffle=True) - - with fluid.dygraph.guard(place): - model = AdaBERTClassifier( - 3, - teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000" - ) - searcher = DARTSearch( - model, - train_reader, - val_reader, - batchsize=batch_size, - num_epochs=epoch, - log_freq=10) - searcher.train() - - -if __name__ == '__main__': - main() diff --git a/demo/bert/train_cell_base.py b/demo/bert/train_cell_base.py index 34b28c11..93a7f70f 100755 --- a/demo/bert/train_cell_base.py +++ b/demo/bert/train_cell_base.py @@ -3,7 +3,6 @@ from itertools import izip import paddle.fluid as fluid from paddleslim.teachers.bert.reader.cls import * from paddleslim.nas.darts.search_space import AdaBERTClassifier -from paddleslim.nas.darts.architect_for_bert import Architect import logging from paddleslim.common import AvgrageMeter, get_logger @@ -18,79 +17,94 @@ def count_parameters_in_MB(all_params): return parameters_number / 1e6 -def model_loss(model, data_ids): - # src_ids = data_ids[0] - # position_ids = data_ids[1] - # sentence_ids = data_ids[2] - # input_mask = data_ids[3] - labels = data_ids[4] - labels.stop_gradient = True - - enc_output = model(data_ids) - - ce_loss, probs = fluid.layers.softmax_with_cross_entropy( - logits=enc_output, label=labels, return_softmax=True) - loss = fluid.layers.mean(x=ce_loss) - num_seqs = fluid.layers.create_tensor(dtype='int64') - accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) - return loss, accuracy - - -def train_one_epoch(model, architect, train_loader, valid_loader, optimizer, - epoch, use_data_parallel, log_freq): - ce_losses = AvgrageMeter() +def train_one_epoch(model, train_loader, valid_loader, optimizer, + arch_optimizer, epoch, use_data_parallel, log_freq): + total_losses = AvgrageMeter() accs = AvgrageMeter() + ce_losses = AvgrageMeter() + kd_losses = AvgrageMeter() + val_accs = AvgrageMeter() model.train() step_id = 0 - for train_data, valid_data in izip(train_loader(), valid_loader): - architect.step(train_data, valid_data) - loss, acc = model_loss(model, train_data) + for train_data, valid_data in izip(train_loader(), valid_loader()): + #for train_data in train_loader(): + batch_size = train_data[0].shape[0] + + try: + total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss( + train_data, epoch) + except: + total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data, + epoch) if use_data_parallel: - loss = model.scale_loss(loss) - loss.backward() + total_loss = model.scale_loss(total_loss) + total_loss.backward() model.apply_collective_grads() else: - loss.backward() - - optimizer.minimize(loss) + total_loss.backward() + optimizer.minimize(total_loss) model.clear_gradients() - - batch_size = train_data[0].shape[0] - ce_losses.update(loss.numpy(), batch_size) + total_losses.update(total_loss.numpy(), batch_size) accs.update(acc.numpy(), batch_size) + ce_losses.update(ce_loss.numpy(), batch_size) + kd_losses.update(kd_loss.numpy(), batch_size) + + try: + arch_loss, _, _, _, arch_logits = model._layers.loss(valid_data, + epoch) + except: + arch_loss, _, _, _, arch_logits = model.loss(valid_data, epoch) + + if use_data_parallel: + arch_loss = model.scale_loss(arch_loss) + arch_loss.backward() + model.apply_collective_grads() + else: + arch_loss.backward() + arch_optimizer.minimize(arch_loss) + arch_optimizer.clear_gradients() + probs = fluid.layers.softmax(arch_logits[-1]) + val_acc = fluid.layers.accuracy(input=probs, label=valid_data[4]) + val_accs.update(val_acc.numpy(), batch_size) if step_id % log_freq == 0: logger.info( - "Train Epoch {}, Step {}, Lr {:.6f} loss {:.6f}; acc: {:.6f};". + "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, valid_acc {:.6f};". format(epoch, step_id, - optimizer.current_step_lr(), ce_losses.avg[0], accs.avg[ - 0])) + optimizer.current_step_lr(), total_losses.avg[ + 0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0], + val_accs.avg[0])) step_id += 1 def valid_one_epoch(model, valid_loader, epoch, log_freq): - ce_losses = AvgrageMeter() accs = AvgrageMeter() + ce_losses = AvgrageMeter() model.eval() step_id = 0 for valid_data in valid_loader(): - loss, acc = model_loss(model, valid_data) + try: + loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch) + except: + loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch) batch_size = valid_data[0].shape[0] - ce_losses.update(loss.numpy(), batch_size) + ce_losses.update(ce_loss.numpy(), batch_size) accs.update(acc.numpy(), batch_size) - if step_id % log_freq == 0: - logger.info("Valid Epoch {}, Step {}, loss {:.6f}; acc: {:.6f};". - format(epoch, step_id, ce_losses.avg[0], accs.avg[0])) + # if step_id % log_freq == 0: + # logger.info("Valid Epoch {}, Step {}, ce_loss {:.6f}; acc: {:.6f};". + # format(epoch, step_id, ce_losses.avg[0], accs.avg[0])) step_id += 1 + return ce_losses.avg[0], accs.avg[0] def main(): - use_data_parallel = False + # whether use multi-gpus + use_data_parallel = True place = fluid.CUDAPlace(fluid.dygraph.parallel.Env( ).dev_id) if use_data_parallel else fluid.CUDAPlace(0) @@ -98,44 +112,31 @@ def main(): bert_config_path = BERT_BASE_PATH + "/bert_config.json" vocab_path = BERT_BASE_PATH + "/vocab.txt" data_dir = "./data/glue_data/MNLI/" - teacher_model_dir = "./teacher_model/steps_23000" - num_samples = 392702 - max_seq_len = 128 + teacher_model_dir = "./data/teacher_model/steps_23000" do_lower_case = True - batch_size = 128 + #num_samples = 392702 + num_samples = 8016987 + max_seq_len = 128 + batch_size = 64 hidden_size = 768 emb_size = 768 max_layer = 8 epoch = 80 log_freq = 10 - use_fixed_gumbel = True - - processor = MnliProcessor( - data_dir=data_dir, - vocab_path=vocab_path, - max_seq_len=max_seq_len, - do_lower_case=do_lower_case, - in_tokens=False) - - train_reader = processor.data_generator( - batch_size=batch_size, - phase='search_train', - epoch=1, - dev_count=1, - shuffle=True) - - val_reader = processor.data_generator( - batch_size=batch_size, - phase='search_valid', - epoch=1, - dev_count=1, - shuffle=True) - - if use_data_parallel: - train_reader = fluid.contrib.reader.distributed_batch_reader( - train_reader) - valid_reader = fluid.contrib.reader.distributed_batch_reader( - valid_reader) + + device_num = fluid.dygraph.parallel.Env().nranks + search = True + + if search: + use_fixed_gumbel = False + train_phase = "search_train" + val_phase = "search_valid" + step_per_epoch = int(num_samples / ((batch_size * 0.5) * device_num)) + else: + use_fixed_gumbel = True + train_phase = "train" + val_phase = "dev" + step_per_epoch = int(num_samples / (batch_size * device_num)) with fluid.dygraph.guard(place): model = AdaBERTClassifier( @@ -147,47 +148,106 @@ def main(): data_dir=data_dir, use_fixed_gumbel=use_fixed_gumbel) - if use_data_parallel: - strategy = fluid.dygraph.parallel.prepare_context() - model = fluid.dygraph.parallel.DataParallel(model, strategy) - - device_num = fluid.dygraph.parallel.Env().nranks - step_per_epoch = int(num_samples / (batch_size * device_num)) learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch) - model_parameters = [ - p for p in model.parameters() - if p.name not in [a.name for a in model.arch_parameters()] - ] + model_parameters = [] + for p in model.parameters(): + if (p.name not in [a.name for a in model.arch_parameters()] and + p.name not in + [a.name for a in model.teacher.parameters()]): + model_parameters.append(p) - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) + #clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) optimizer = fluid.optimizer.MomentumOptimizer( learning_rate, 0.9, regularization=fluid.regularizer.L2DecayRegularizer(3e-4), - parameter_list=model_parameters, - grad_clip=clip) + parameter_list=model_parameters) + # grad_clip=clip) + + arch_optimizer = fluid.optimizer.Adam( + 3e-4, + 0.5, + 0.999, + regularization=fluid.regularizer.L2Decay(1e-3), + parameter_list=model.arch_parameters()) + + processor = MnliProcessor( + data_dir=data_dir, + vocab_path=vocab_path, + max_seq_len=max_seq_len, + do_lower_case=do_lower_case, + in_tokens=False) + + train_reader = processor.data_generator( + batch_size=batch_size, + phase=train_phase, + epoch=1, + dev_count=1, + shuffle=True) + + valid_reader = processor.data_generator( + batch_size=batch_size, + phase=val_phase, + epoch=1, + dev_count=1, + shuffle=True) + print("train_data nums:", processor.get_num_examples(train_phase)) + print("valid_data nums:", processor.get_num_examples(val_phase)) + print("dev_data nums:", processor.get_num_examples("dev")) + + if use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + valid_reader = fluid.contrib.reader.distributed_batch_reader( + valid_reader) + + dev_reader = processor.data_generator( + batch_size=batch_size, + phase="dev", + epoch=1, + dev_count=1, + shuffle=False) train_loader = fluid.io.DataLoader.from_generator( - capacity=1024, + capacity=512, use_double_buffer=True, iterable=True, - return_list=True) + return_list=True, + use_multiprocess=True) valid_loader = fluid.io.DataLoader.from_generator( - capacity=1024, + capacity=512, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + dev_loader = fluid.io.DataLoader.from_generator( + capacity=512, use_double_buffer=True, iterable=True, - return_list=True) + return_list=True, + use_multiprocess=True) + train_loader.set_batch_generator(train_reader, places=place) - valid_loader.set_batch_generator(val_reader, places=place) + valid_loader.set_batch_generator(valid_reader, places=place) + dev_loader.set_batch_generator(dev_reader, places=place) - architect = Architect(model, learning_rate, 3e-4, place, False) + if use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + model = fluid.dygraph.parallel.DataParallel(model, strategy) for epoch_id in range(epoch): - train_one_epoch(model, architect, train_loader, valid_loader, - optimizer, epoch_id, use_data_parallel, log_freq) - valid_one_epoch(model, valid_loader, epoch_id, log_freq) - print(model.student._encoder.alphas.numpy()) + train_one_epoch(model, train_loader, valid_loader, optimizer, + arch_optimizer, epoch_id, use_data_parallel, + log_freq) + loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq) + logger.info("Valid set2, ce_loss {:.6f}; acc: {:.6f};".format(loss, + acc)) + + try: + print(model.student._encoder.alphas.numpy()) + except: + print(model._layers.student._encoder.alphas.numpy()) print("=" * 100) diff --git a/paddleslim/dist/dml.py b/paddleslim/dist/dml.py index 34ae1deb..2ad1c942 100755 --- a/paddleslim/dist/dml.py +++ b/paddleslim/dist/dml.py @@ -18,7 +18,6 @@ from __future__ import print_function import copy import paddle.fluid as fluid -import paddle.nn.functional as F class DML(fluid.dygraph.Layer): @@ -70,7 +69,7 @@ class DML(fluid.dygraph.Layer): cur_kl_loss = 0 for j in range(self.model_num): if i != j: - x = F.log_softmax(logits[i], axis=1) + x = fluid.layers.log_softmax(logits[i], axis=1) y = fluid.layers.softmax(logits[j], axis=1) cur_kl_loss += fluid.layers.kldiv_loss( x, y, reduction='batchmean') diff --git a/paddleslim/nas/darts/architect_for_bert.py b/paddleslim/nas/darts/architect_for_bert.py index 2c4f2645..b1f6df26 100644 --- a/paddleslim/nas/darts/architect_for_bert.py +++ b/paddleslim/nas/darts/architect_for_bert.py @@ -49,17 +49,17 @@ class Architect(object): self.network_weight_decay), parameter_list=self.unrolled_model_params) - def step(self, train_data, valid_data): + def step(self, train_data, valid_data, epoch): if self.unrolled: params_grads = self._backward_step_unrolled(train_data, valid_data) self.optimizer.apply_gradients(params_grads) else: - loss = self._backward_step(valid_data) + loss = self._backward_step(valid_data, epoch) self.optimizer.minimize(loss) self.optimizer.clear_gradients() - def _backward_step(self, valid_data): - loss = self.model.loss(valid_data) + def _backward_step(self, valid_data, epoch): + loss = self.model.loss(valid_data, epoch) loss[0].backward() return loss[0] diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py index 64a076f6..8a807fa2 100644 --- a/paddleslim/nas/darts/search_space/conv_bert/cls.py +++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py @@ -31,6 +31,7 @@ import multiprocessing import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable, Layer, Linear +from paddle.fluid.dygraph.base import to_variable from .reader.cls import * from .model.bert import BertModelLayer from .optimization import Optimizer @@ -65,14 +66,17 @@ class AdaBERTClassifier(Layer): self._teacher_model = teacher_model self._data_dir = data_dir self.use_fixed_gumbel = use_fixed_gumbel - #print( - # "----------------------load teacher model and test----------------------------------------" - #) - #self.teacher = BERTClassifier(num_labels, model_path=self._teacher_model) + self.T = 1.0 + print( + "----------------------load teacher model and test----------------------------------------" + ) + self.teacher = BERTClassifier( + num_labels, model_path=self._teacher_model) + self.teacher.eval() #self.teacher.test(self._data_dir) - #print( - # "----------------------finish load teacher model and test----------------------------------------" - #) + print( + "----------------------finish load teacher model and test----------------------------------------" + ) self.student = BertModelLayer( n_layer=self._n_layer, emb_size=self._emb_size, @@ -81,46 +85,84 @@ class AdaBERTClassifier(Layer): search_layer=self._search_layer, use_fixed_gumbel=self.use_fixed_gumbel) - self.cls_fc = list() - for i in range(self._n_layer): - fc = Linear( - input_dim=self._hidden_size, - output_dim=self._num_labels, - param_attr=fluid.ParamAttr( - name="s_cls_out_%d_w" % i, - initializer=fluid.initializer.TruncatedNormal(scale=0.02)), - bias_attr=fluid.ParamAttr( - name="s_cls_out_%d_b" % i, - initializer=fluid.initializer.Constant(0.))) - fc = self.add_sublayer("cls_fc_%d" % i, fc) - self.cls_fc.append(fc) - - def forward(self, data_ids): + fix_emb = False + for s_emb, t_emb in zip(self.student.emb_names(), + self.teacher.emb_names()): + t_emb.stop_gradient = True + if fix_emb: + s_emb.stop_gradient = True + print( + "Assigning embedding[{}] from teacher to embedding[{}] in student.". + format(t_emb.name, s_emb.name)) + fluid.layers.assign(input=t_emb, output=s_emb) + print( + "Assigned embedding[{}] from teacher to embedding[{}] in student.". + format(t_emb.name, s_emb.name)) + + def forward(self, data_ids, epoch): src_ids = data_ids[0] position_ids = data_ids[1] sentence_ids = data_ids[2] - return self.student(src_ids, position_ids, sentence_ids) + return self.student(src_ids, position_ids, sentence_ids, epoch) def arch_parameters(self): return self.student.arch_parameters() - def genotype(self): - return self.arch_parameters() + def ce(self, logits): + logits = np.exp(logits - np.max(logits)) + logits = logits / logits.sum(axis=0) + return logits - def loss(self, data_ids): + def loss(self, data_ids, epoch): src_ids = data_ids[0] position_ids = data_ids[1] sentence_ids = data_ids[2] input_mask = data_ids[3] labels = data_ids[4] - enc_output = self.student( - src_ids, position_ids, sentence_ids, flops=[], model_size=[]) + s_logits = self.student(src_ids, position_ids, sentence_ids, epoch) + + t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids) + + #define kd loss + kd_weights = [] + for i in range(len(s_logits)): + j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits)))) + kd_weights.append(t_losses[j].numpy()) + + kd_weights = np.array(kd_weights) + kd_weights = np.squeeze(kd_weights) + kd_weights = to_variable(kd_weights) + kd_weights = fluid.layers.softmax(-kd_weights) + + kd_losses = [] + for i in range(len(s_logits)): + j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits)))) + t_logit = t_logits[j] + s_logit = s_logits[i] + t_logit.stop_gradient = True + t_probs = fluid.layers.softmax(t_logit) # P_j^T + s_probs = fluid.layers.softmax(s_logit / self.T) #P_j^S + #kd_loss = -t_probs * fluid.layers.log(s_probs) + kd_loss = fluid.layers.cross_entropy( + input=s_probs, label=t_probs, soft_label=True) + kd_loss = fluid.layers.reduce_mean(kd_loss) + kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i]) + kd_losses.append(kd_loss) + kd_loss = fluid.layers.sum(kd_losses) + + losses = [] + for logit in s_logits: + ce_loss, probs = fluid.layers.softmax_with_cross_entropy( + logits=logit, label=labels, return_softmax=True) + loss = fluid.layers.mean(x=ce_loss) + losses.append(loss) + + num_seqs = fluid.layers.create_tensor(dtype='int64') + accuracy = fluid.layers.accuracy( + input=probs, label=labels, total=num_seqs) + ce_loss = fluid.layers.sum(losses) + + total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss - ce_loss, probs = fluid.layers.softmax_with_cross_entropy( - logits=enc_output, label=labels, return_softmax=True) - loss = fluid.layers.mean(x=ce_loss) - num_seqs = fluid.layers.create_tensor(dtype='int64') - accuracy = fluid.layers.accuracy( - input=probs, label=labels, total=num_seqs) - return loss, accuracy + return total_loss, accuracy, ce_loss, kd_loss, s_logits diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py index 2a4e4e3f..405b9516 100755 --- a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py +++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py @@ -56,9 +56,9 @@ class BertModelLayer(Layer): self.use_fixed_gumbel = use_fixed_gumbel - self._word_emb_name = "word_embedding" - self._pos_emb_name = "pos_embedding" - self._sent_emb_name = "sent_embedding" + self._word_emb_name = "s_word_embedding" + self._pos_emb_name = "s_pos_embedding" + self._sent_emb_name = "s_sent_embedding" self._dtype = "float16" if use_fp16 else "float32" self._conv_type = conv_type @@ -95,6 +95,10 @@ class BertModelLayer(Layer): search_layer=self._search_layer, use_fixed_gumbel=self.use_fixed_gumbel) + def emb_names(self): + return self._src_emb.parameters() + self._pos_emb.parameters( + ) + self._sent_emb.parameters() + def max_flops(self): return self._encoder.max_flops @@ -108,6 +112,7 @@ class BertModelLayer(Layer): src_ids, position_ids, sentence_ids, + epoch, flops=[], model_size=[]): """ @@ -150,9 +155,9 @@ class BertModelLayer(Layer): src_emb_1 = self._src_emb(ids1) emb_out_0 = self._emb_fac(src_emb_0) emb_out_1 = self._emb_fac(src_emb_1) - # (bs, seq_len, 768) + # (bs, seq_len, hidden_size) - enc_output = self._encoder( - emb_out_0, emb_out_1, flops=flops, model_size=model_size) + enc_outputs = self._encoder( + emb_out_0, emb_out_1, epoch, flops=flops, model_size=model_size) - return enc_output + return enc_outputs diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py index 19528300..951e2749 100755 --- a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py +++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py @@ -52,11 +52,6 @@ class MixedOp(fluid.dygraph.Layer): def __init__(self, n_channel, name=None): super(MixedOp, self).__init__() PRIMITIVES = ConvBN_PRIMITIVES - # ops = [ - # OPS[primitive](n_channel, name - # if name is None else name + "/" + primitive) - # for primitive in PRIMITIVES - # ] ops = [] for primitive in PRIMITIVES: op = OPS[primitive](n_channel, name @@ -74,21 +69,19 @@ class MixedOp(fluid.dygraph.Layer): self._ops = fluid.dygraph.LayerList(ops) - def forward(self, x, weights): - #out = weights[0] * self._ops[0](x) - # out = fluid.layers.sums( - # [weights[i] * op(x) for i, op in enumerate(self._ops)]) - # return out + def forward(self, x, weights, index): + out = fluid.layers.sums( + [weights[i] * op(x) for i, op in enumerate(self._ops)]) + return out - for i in range(len(self._ops)): - if weights[i].numpy() != 0: - return self._ops[i](x) * weights[i] + # causebug in multi-gpus + #for i in range(len(self._ops)): + # if weights[i].numpy() != 0: + # return self._ops[i](x) * weights[i] -def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10): - #U = np.random.uniform(0, 1, logits.shape) - #U = - to_variable( - # np.log(-np.log(U + eps) + eps).astype("float32")) +def gumbel_softmax(logits, epoch, temperature=1.0, hard=True, eps=1e-10): + temperature = temperature * (0.98**epoch) U = np.random.gumbel(0, 1, logits.shape).astype("float32") logits = logits + to_variable(U) @@ -98,13 +91,13 @@ def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10): if hard: maxes = fluid.layers.reduce_max(logits, dim=1, keep_dim=True) hard = fluid.layers.cast((logits == maxes), logits.dtype) - # out = hard - logits.detach() + logits - tmp = hard - logits - tmp.stop_gradient = True - out = tmp + logits + index = np.argmax(hard.numpy(), axis=1) + out = hard - logits.detach() + logits + # tmp.stop_gradient = True + # out = tmp + logits else: out = logits - return out + return out, index class Zero(fluid.dygraph.Layer): @@ -135,8 +128,6 @@ class ReluConvBN(fluid.dygraph.Layer): use_cudnn=True, name=None): super(ReluConvBN, self).__init__() - #conv_std = (2.0 / - # (filter_size[0] * filter_size[1] * out_c * in_c))**0.5 conv_param = fluid.ParamAttr( name=name if name is None else (name + "_conv.weights"), initializer=fluid.initializer.MSRA()) @@ -184,7 +175,7 @@ class Cell(fluid.dygraph.Layer): ops.append(op) self._ops = fluid.dygraph.LayerList(ops) - def forward(self, s0, s1, weights): + def forward(self, s0, s1, weights, index): s0 = self.preprocess0(s0) s1 = self.preprocess1(s1) @@ -192,7 +183,8 @@ class Cell(fluid.dygraph.Layer): offset = 0 for i in range(self._steps): s = fluid.layers.sums([ - self._ops[offset + j](h, weights[offset + j]) + self._ops[offset + j](h, weights[offset + j], + index[offset + j]) for j, h in enumerate(states) ]) offset += len(states) @@ -216,12 +208,27 @@ class EncoderLayer(Layer): super(EncoderLayer, self).__init__() self._n_layer = n_layer self._hidden_size = hidden_size - self._n_channel = 256 + self._n_channel = 128 self._steps = 3 self._n_ops = len(ConvBN_PRIMITIVES) self.use_fixed_gumbel = use_fixed_gumbel - self.stem = fluid.dygraph.Sequential( + self.stem0 = fluid.dygraph.Sequential( + Conv2D( + num_channels=1, + num_filters=self._n_channel, + filter_size=[3, self._hidden_size], + padding=[1, 0], + param_attr=fluid.ParamAttr(initializer=MSRA()), + bias_attr=False), + BatchNorm( + num_channels=self._n_channel, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0)))) + + self.stem1 = fluid.dygraph.Sequential( Conv2D( num_channels=1, num_filters=self._n_channel, @@ -254,11 +261,7 @@ class EncoderLayer(Layer): default_initializer=NormalInitializer( loc=0.0, scale=1e-3)) - # self.k = fluid.layers.create_parameter( - # shape=[1, self._n_layer], - # dtype="float32", - # default_initializer=NormalInitializer( - # loc=0.0, scale=1e-3)) + self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) self.BN = BatchNorm( num_channels=self._n_channel, @@ -269,38 +272,58 @@ class EncoderLayer(Layer): initializer=fluid.initializer.Constant(value=0), trainable=False)) - self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) - - self.out = Linear( - self._n_channel, - 3, - param_attr=ParamAttr(initializer=MSRA()), - bias_attr=ParamAttr(initializer=MSRA())) + self.bns = [] + self.outs = [] + for i in range(self._n_layer): + bn = BatchNorm( + num_channels=self._n_channel, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1), + trainable=False), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0), + trainable=False)) + out = Linear( + self._n_channel, + 3, + param_attr=ParamAttr(initializer=MSRA()), + bias_attr=ParamAttr(initializer=MSRA())) + self.bns.append(bn) + self.outs.append(out) + self._bns = fluid.dygraph.LayerList(self.bns) + self._outs = fluid.dygraph.LayerList(self.outs) + + self.pooled_fc = Linear( + input_dim=self._n_channel, + output_dim=self._hidden_size, + param_attr=fluid.ParamAttr( + name=self.full_name() + "pooled_fc.w_0", + initializer=fluid.initializer.TruncatedNormal(scale=1.0)), + bias_attr=fluid.ParamAttr(name=self.full_name() + "pooled_fc.b_0"), + act="tanh") self.use_fixed_gumbel = use_fixed_gumbel - self.gumbel_alphas = gumbel_softmax(self.alphas).detach() + self.gumbel_alphas = gumbel_softmax(self.alphas, 0)[0].detach() + #print("gumbel_alphas: \n", self.gumbel_alphas.numpy()) - def forward(self, enc_input_0, enc_input_1, flops=[], model_size=[]): - alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax( - self.alphas) + def forward(self, enc_input_0, enc_input_1, epoch, flops=[], + model_size=[]): + alphas, index = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax( + self.alphas, epoch) - s0 = fluid.layers.reshape( - enc_input_0, [-1, 1, enc_input_0.shape[1], enc_input_0.shape[2]]) - s1 = fluid.layers.reshape( - enc_input_1, [-1, 1, enc_input_1.shape[1], enc_input_1.shape[2]]) - # (bs, 1, seq_len, hidden_size) - - s0 = self.stem(s0) - s1 = self.stem(s1) - # (bs, n_channel, seq_len, 1) + s0 = fluid.layers.unsqueeze(enc_input_0, [1]) + s1 = fluid.layers.unsqueeze(enc_input_1, [1]) + s0 = self.stem0(s0) + s1 = self.stem1(s1) + enc_outputs = [] for i in range(self._n_layer): - s0, s1 = s1, self._cells[i](s0, s1, alphas) - # (bs, n_channel, seq_len, 1) - - s1 = self.BN(s1) - - outputs = self.pool2d_avg(s1) - outputs = fluid.layers.reshape(outputs, shape=[-1, 0]) - outputs = self.out(outputs) - return outputs + s0, s1 = s1, self._cells[i](s0, s1, alphas, index) + # (bs, n_channel, seq_len, 1) + tmp = self._bns[i](s1) + tmp = self.pool2d_avg(tmp) + tmp = fluid.layers.reshape(tmp, shape=[-1, 0]) + tmp = self._outs[i](tmp) + enc_outputs.append(tmp) + + return enc_outputs diff --git a/paddleslim/teachers/bert/cls.py b/paddleslim/teachers/bert/cls.py index 9b27eb67..8f7e1a4b 100755 --- a/paddleslim/teachers/bert/cls.py +++ b/paddleslim/teachers/bert/cls.py @@ -58,7 +58,8 @@ class BERTClassifier(Layer): num_labels, task_name="mnli", model_path=None, - use_cuda=True): + use_cuda=True, + return_pooled_out=True): super(BERTClassifier, self).__init__() self.task_name = task_name.lower() BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/" @@ -84,7 +85,7 @@ class BERTClassifier(Layer): } self.cls_model = ClsModelLayer( - self.bert_config, num_labels, return_pooled_out=True) + self.bert_config, num_labels, return_pooled_out=return_pooled_out) if model_path is not None: #restore the model @@ -101,6 +102,9 @@ class BERTClassifier(Layer): "You should load pretrained model for training this teacher model." ) + def emb_names(self): + return self.cls_model.emb_names() + def forward(self, input): return self.cls_model(input) diff --git a/paddleslim/teachers/bert/model/bert.py b/paddleslim/teachers/bert/model/bert.py index bf4ae641..d09c0dea 100644 --- a/paddleslim/teachers/bert/model/bert.py +++ b/paddleslim/teachers/bert/model/bert.py @@ -122,6 +122,10 @@ class BertModelLayer(Layer): postprocess_cmd="dan", param_initializer=self._param_initializer) + def emb_names(self): + return self._src_emb.parameters() + self._pos_emb.parameters( + ) + self._sent_emb.parameters() + def forward(self, src_ids, position_ids, sentence_ids, input_mask): """ forward diff --git a/paddleslim/teachers/bert/model/cls.py b/paddleslim/teachers/bert/model/cls.py index 76ee6ad9..bdfef8b5 100644 --- a/paddleslim/teachers/bert/model/cls.py +++ b/paddleslim/teachers/bert/model/cls.py @@ -46,6 +46,7 @@ class ClsModelLayer(Layer): self.use_fp16 = use_fp16 self.loss_scaling = loss_scaling self.n_layers = config['num_hidden_layers'] + self.return_pooled_out = return_pooled_out self.bert_layer = BertModelLayer( config=self.config, return_pooled_out=True, use_fp16=self.use_fp16) @@ -64,6 +65,9 @@ class ClsModelLayer(Layer): fc = self.add_sublayer("cls_fc_%d" % i, fc) self.cls_fc.append(fc) + def emb_names(self): + return self.bert_layer.emb_names() + def forward(self, data_ids): """ forward @@ -76,11 +80,23 @@ class ClsModelLayer(Layer): enc_outputs, next_sent_feats = self.bert_layer( src_ids, position_ids, sentence_ids, input_mask) + + if not self.return_pooled_out: + cls_feat = fluid.layers.dropout( + x=next_sent_feats[-1], + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + logits = self.cls_fc[-1](cls_feat) + probs = fluid.layers.softmax(logits) + num_seqs = fluid.layers.create_tensor(dtype='int64') + accuracy = fluid.layers.accuracy( + input=probs, label=labels, total=num_seqs) + return enc_outputs, logits, accuracy, num_seqs + logits = [] losses = [] accuracys = [] for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc): - cls_feat = fluid.layers.dropout( x=next_sent_feat, dropout_prob=0.1, diff --git a/paddleslim/teachers/bert/reader/cls.py b/paddleslim/teachers/bert/reader/cls.py index 23d62d5c..56e54531 100644 --- a/paddleslim/teachers/bert/reader/cls.py +++ b/paddleslim/teachers/bert/reader/cls.py @@ -16,6 +16,7 @@ import io import os import types import csv +import random import numpy as np from . import tokenization from .batching import prepare_batch_data @@ -110,9 +111,9 @@ class DataProcessor(object): def get_num_examples(self, phase): """Get number of examples for train, dev or test.""" - if phase not in ['train', 'dev', 'test']: - raise ValueError( - "Unknown phase, which should be in ['train', 'dev', 'test'].") + #if phase not in ['train', 'dev', 'test']: + # raise ValueError( + # "Unknown phase, which should be in ['train', 'dev', 'test'].") return self.num_examples[phase] def get_train_progress(self): @@ -135,6 +136,8 @@ class DataProcessor(object): epoch: int. Total epoches to generate data. shuffle: bool. Whether to shuffle examples. """ + search_examples = self.get_train_examples(self.data_dir) + random.shuffle(search_examples) if phase == 'train': examples = self.get_train_examples(self.data_dir) self.num_examples['train'] = len(examples) @@ -145,13 +148,13 @@ class DataProcessor(object): examples = self.get_test_examples(self.data_dir) self.num_examples['test'] = len(examples) elif phase == 'search_train': - examples = self.get_train_examples(self.data_dir) - self.num_examples['search_train'] = len(examples) / 2 - examples = examples[:self.num_examples['search_train']] + #examples = self.get_train_examples(self.data_dir) + self.num_examples['search_train'] = len(search_examples) / 2 + examples = search_examples[:self.num_examples['search_train']] elif phase == 'search_valid': - examples = self.get_train_examples(self.data_dir) - self.num_examples['search_valid'] = len(examples) / 2 - examples = examples[self.num_examples['search_train']:] + #examples = self.get_train_examples(self.data_dir) + self.num_examples['search_valid'] = len(search_examples) / 2 + examples = search_examples[self.num_examples['search_valid']:] else: raise ValueError( "Unknown phase, which should be in ['train', 'dev', 'test'].") @@ -340,7 +343,7 @@ class MnliProcessor(DataProcessor): def get_train_examples(self, data_dir): """See base class.""" return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + self._read_tsv(os.path.join(data_dir, "train_aug.tsv")), "train") def get_dev_examples(self, data_dir): """See base class.""" -- GitLab