diff --git a/demo/bert/train_distill.py b/demo/bert/train_distill.py
index c9e042f54b5f43d90c31e15b77ec2cb3a9260142..44eecbf4388eb443cc79241bea23fb69c99a0e9c 100755
--- a/demo/bert/train_distill.py
+++ b/demo/bert/train_distill.py
@@ -81,20 +81,33 @@ def main():
 
     BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
     vocab_path = BERT_BASE_PATH + "/vocab.txt"
-    data_dir = "./data/glue_data/MNLI/"
-    teacher_model_dir = "./data/teacher_model/steps_23000"
+
     do_lower_case = True
-    num_samples = 392702
 
     # augmented dataset nums
     # num_samples = 8016987
+
     max_seq_len = 128
     batch_size = 192
     hidden_size = 768
     emb_size = 768
-    max_layer = 8
     epoch = 80
     log_freq = 10
+    task_name = 'mnli'
+
+    if task_name == 'mrpc':
+        data_dir = "./data/glue_data/MRPC/"
+        teacher_model_dir = "./data/teacher_model/mrpc"
+        num_samples = 3668
+        max_layer = 4
+        processor_func = MrpcProcessor
+    elif task_name == 'mnli':
+        data_dir = "./data/glue_data/MNLI/"
+        teacher_model_dir = "./data/teacher_model/step_23000"
+        num_samples = 392702
+        max_layer = 8
+        processor_func = MnliProcessor
+
     device_num = fluid.dygraph.parallel.Env().nranks
     use_fixed_gumbel = True
     train_phase = "train"
@@ -107,9 +120,10 @@ def main():
         np.random.seed(1)
         fluid.default_main_program().random_seed = 1
         model = AdaBERTClassifier(
-            3,
+            2,
             n_layer=max_layer,
             hidden_size=hidden_size,
+            task_name=task_name,
             emb_size=emb_size,
             teacher_model=teacher_model_dir,
             data_dir=data_dir,
@@ -130,7 +144,7 @@ def main():
             regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
             parameter_list=model_parameters)
 
-        processor = MnliProcessor(
+        processor = processor_func(
             data_dir=data_dir,
             vocab_path=vocab_path,
             max_seq_len=max_seq_len,
@@ -172,12 +186,16 @@ def main():
             strategy = fluid.dygraph.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
+        best_valid_acc = 0
         for epoch_id in range(epoch):
             train_one_epoch(model, train_loader, optimizer, epoch_id,
                             use_data_parallel, log_freq)
             loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
-            logger.info("dev set, ce_loss {:.6f}; acc: {:.6f};".format(loss,
-                                                                       acc))
+            if acc > best_valid_acc:
+                best_valid_acc = acc
+            logger.info(
+                "dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format(
+                    loss, acc, best_valid_acc))
 
 
 if __name__ == '__main__':
diff --git a/demo/bert/train_search.py b/demo/bert/train_search.py
index 0095396573797e347158d6e1dfc1882f0f0373f8..685c9f995ca2805e6c03cabc68d702ff9600da78 100755
--- a/demo/bert/train_search.py
+++ b/demo/bert/train_search.py
@@ -222,9 +222,9 @@ def main():
                     acc))
 
         if use_data_parallel:
-            print(model.student._encoder.alphas.numpy())
-        else:
             print(model._layers.student._encoder.alphas.numpy())
+        else:
+            print(model.student._encoder.alphas.numpy())
         print("=" * 100)
 
 
diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py
index d2452efb338dc046e16037255507fcdccff9deae..3c46c443117399993d22200f8832b1bd1f66dcf2 100644
--- a/paddleslim/nas/darts/search_space/conv_bert/cls.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py
@@ -49,6 +49,7 @@ class AdaBERTClassifier(Layer):
                  hidden_size=768,
                  gamma=0.8,
                  beta=4,
+                 task_name='mnli',
                  conv_type="conv_bn",
                  search_layer=False,
                  teacher_model=None,
@@ -75,7 +76,7 @@ class AdaBERTClassifier(Layer):
             "----------------------load teacher model and test----------------------------------------"
         )
         self.teacher = BERTClassifier(
-            num_labels, model_path=self._teacher_model)
+            num_labels, task_name=task_name, model_path=self._teacher_model)
         # global setting, will be overwritten when training(about 1% acc loss)
         self.teacher.eval()
         self.teacher.test(self._data_dir)
@@ -83,6 +84,7 @@
             "----------------------finish load teacher model and test----------------------------------------"
         )
         self.student = BertModelLayer(
+            num_labels=num_labels,
             n_layer=self._n_layer,
             emb_size=self._emb_size,
             hidden_size=self._hidden_size,
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
index 67e25a65b9ac2f4cf4af0a2ff20aa315c7542de2..38de45e6d88058800b4880d7d2354f5a2b5605f9 100755
--- a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py
@@ -32,6 +32,7 @@ from .transformer_encoder import EncoderLayer
 
 class BertModelLayer(Layer):
     def __init__(self,
+                 num_labels,
                  emb_size=128,
                  hidden_size=768,
                  n_layer=12,
@@ -91,6 +92,7 @@ class BertModelLayer(Layer):
             param_attr=fluid.ParamAttr(name="s_emb_factorization"))
 
         self._encoder = EncoderLayer(
+            num_labels=num_labels,
             n_layer=self._n_layer,
             hidden_size=self._hidden_size,
             search_layer=self._search_layer,
diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
index f23d7a225df4631ee426089b299707ad9c73f3b8..bad9a1e266e04022543269b32f3b45e709d97451 100755
--- a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
+++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py
@@ -200,6 +200,7 @@ class EncoderLayer(Layer):
     """
 
     def __init__(self,
+                 num_labels,
                  n_layer,
                  hidden_size=768,
                  name="encoder",
@@ -276,7 +277,7 @@ class EncoderLayer(Layer):
                     trainable=False))
             out = Linear(
                 self._n_channel,
-                3,
+                num_labels,
                 param_attr=ParamAttr(initializer=MSRA()),
                 bias_attr=ParamAttr(initializer=MSRA()))
             self.bns.append(bn)