From 8c7465b2fb4fe1259905e8c243398fe4f0f7cb11 Mon Sep 17 00:00:00 2001 From: Bai Yifan Date: Thu, 9 Jul 2020 13:14:49 +0800 Subject: [PATCH] Update bert distillation and search code (#376) --- demo/bert/search_bert.py | 56 ----- demo/bert/train_cell_base.py | 195 --------------- demo/bert/train_distill.py | 204 +++++++++++++++ demo/bert/train_search.py | 232 ++++++++++++++++++ paddleslim/nas/darts/architect_for_bert.py | 8 +- .../nas/darts/search_space/conv_bert/cls.py | 100 +++----- .../search_space/conv_bert/model/bert.py | 53 +--- .../conv_bert/model/transformer_encoder.py | 138 +++++------ paddleslim/teachers/bert/cls.py | 5 +- paddleslim/teachers/bert/model/cls.py | 15 +- paddleslim/teachers/bert/reader/cls.py | 52 +++- 11 files changed, 619 insertions(+), 439 deletions(-) delete mode 100644 demo/bert/search_bert.py delete mode 100755 demo/bert/train_cell_base.py create mode 100755 demo/bert/train_distill.py create mode 100755 demo/bert/train_search.py diff --git a/demo/bert/search_bert.py b/demo/bert/search_bert.py deleted file mode 100644 index a97739f3..00000000 --- a/demo/bert/search_bert.py +++ /dev/null @@ -1,56 +0,0 @@ -import paddle.fluid as fluid -from paddleslim.teachers.bert.reader.cls import * -from paddleslim.nas.darts.search_space import AdaBERTClassifier -from paddleslim.nas.darts import DARTSearch - - -def main(): - place = fluid.CUDAPlace(0) - - BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/" - bert_config_path = BERT_BASE_PATH + "/bert_config.json" - vocab_path = BERT_BASE_PATH + "/vocab.txt" - data_dir = "./data/glue_data/MNLI/" - max_seq_len = 512 - do_lower_case = True - batch_size = 32 - epoch = 30 - - processor = MnliProcessor( - data_dir=data_dir, - vocab_path=vocab_path, - max_seq_len=max_seq_len, - do_lower_case=do_lower_case, - in_tokens=False) - - train_reader = processor.data_generator( - batch_size=batch_size, - phase='train', - epoch=epoch, - dev_count=1, - shuffle=True) - - val_reader = processor.data_generator( - batch_size=batch_size, - phase='train', - epoch=epoch, - dev_count=1, - shuffle=True) - - with fluid.dygraph.guard(place): - model = AdaBERTClassifier( - 3, - teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000" - ) - searcher = DARTSearch( - model, - train_reader, - val_reader, - batchsize=batch_size, - num_epochs=epoch, - log_freq=10) - searcher.train() - - -if __name__ == '__main__': - main() diff --git a/demo/bert/train_cell_base.py b/demo/bert/train_cell_base.py deleted file mode 100755 index 34b28c11..00000000 --- a/demo/bert/train_cell_base.py +++ /dev/null @@ -1,195 +0,0 @@ -import numpy as np -from itertools import izip -import paddle.fluid as fluid -from paddleslim.teachers.bert.reader.cls import * -from paddleslim.nas.darts.search_space import AdaBERTClassifier -from paddleslim.nas.darts.architect_for_bert import Architect - -import logging -from paddleslim.common import AvgrageMeter, get_logger -logger = get_logger(__name__, level=logging.INFO) - - -def count_parameters_in_MB(all_params): - parameters_number = 0 - for param in all_params: - if param.trainable: - parameters_number += np.prod(param.shape) - return parameters_number / 1e6 - - -def model_loss(model, data_ids): - # src_ids = data_ids[0] - # position_ids = data_ids[1] - # sentence_ids = data_ids[2] - # input_mask = data_ids[3] - labels = data_ids[4] - labels.stop_gradient = True - - enc_output = model(data_ids) - - ce_loss, probs = fluid.layers.softmax_with_cross_entropy( - logits=enc_output, label=labels, return_softmax=True) - loss = fluid.layers.mean(x=ce_loss) - num_seqs = fluid.layers.create_tensor(dtype='int64') - accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs) - return loss, accuracy - - -def train_one_epoch(model, architect, train_loader, valid_loader, optimizer, - epoch, use_data_parallel, log_freq): - ce_losses = AvgrageMeter() - accs = AvgrageMeter() - model.train() - - step_id = 0 - for train_data, valid_data in izip(train_loader(), valid_loader): - architect.step(train_data, valid_data) - loss, acc = model_loss(model, train_data) - - if use_data_parallel: - loss = model.scale_loss(loss) - loss.backward() - model.apply_collective_grads() - else: - loss.backward() - - optimizer.minimize(loss) - model.clear_gradients() - - batch_size = train_data[0].shape[0] - ce_losses.update(loss.numpy(), batch_size) - accs.update(acc.numpy(), batch_size) - - if step_id % log_freq == 0: - logger.info( - "Train Epoch {}, Step {}, Lr {:.6f} loss {:.6f}; acc: {:.6f};". - format(epoch, step_id, - optimizer.current_step_lr(), ce_losses.avg[0], accs.avg[ - 0])) - step_id += 1 - - -def valid_one_epoch(model, valid_loader, epoch, log_freq): - ce_losses = AvgrageMeter() - accs = AvgrageMeter() - model.eval() - - step_id = 0 - for valid_data in valid_loader(): - loss, acc = model_loss(model, valid_data) - - batch_size = valid_data[0].shape[0] - ce_losses.update(loss.numpy(), batch_size) - accs.update(acc.numpy(), batch_size) - - if step_id % log_freq == 0: - logger.info("Valid Epoch {}, Step {}, loss {:.6f}; acc: {:.6f};". - format(epoch, step_id, ce_losses.avg[0], accs.avg[0])) - step_id += 1 - - -def main(): - use_data_parallel = False - place = fluid.CUDAPlace(fluid.dygraph.parallel.Env( - ).dev_id) if use_data_parallel else fluid.CUDAPlace(0) - - BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12" - bert_config_path = BERT_BASE_PATH + "/bert_config.json" - vocab_path = BERT_BASE_PATH + "/vocab.txt" - data_dir = "./data/glue_data/MNLI/" - teacher_model_dir = "./teacher_model/steps_23000" - num_samples = 392702 - max_seq_len = 128 - do_lower_case = True - batch_size = 128 - hidden_size = 768 - emb_size = 768 - max_layer = 8 - epoch = 80 - log_freq = 10 - use_fixed_gumbel = True - - processor = MnliProcessor( - data_dir=data_dir, - vocab_path=vocab_path, - max_seq_len=max_seq_len, - do_lower_case=do_lower_case, - in_tokens=False) - - train_reader = processor.data_generator( - batch_size=batch_size, - phase='search_train', - epoch=1, - dev_count=1, - shuffle=True) - - val_reader = processor.data_generator( - batch_size=batch_size, - phase='search_valid', - epoch=1, - dev_count=1, - shuffle=True) - - if use_data_parallel: - train_reader = fluid.contrib.reader.distributed_batch_reader( - train_reader) - valid_reader = fluid.contrib.reader.distributed_batch_reader( - valid_reader) - - with fluid.dygraph.guard(place): - model = AdaBERTClassifier( - 3, - n_layer=max_layer, - hidden_size=hidden_size, - emb_size=emb_size, - teacher_model=teacher_model_dir, - data_dir=data_dir, - use_fixed_gumbel=use_fixed_gumbel) - - if use_data_parallel: - strategy = fluid.dygraph.parallel.prepare_context() - model = fluid.dygraph.parallel.DataParallel(model, strategy) - - device_num = fluid.dygraph.parallel.Env().nranks - step_per_epoch = int(num_samples / (batch_size * device_num)) - learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch) - - model_parameters = [ - p for p in model.parameters() - if p.name not in [a.name for a in model.arch_parameters()] - ] - - clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0) - optimizer = fluid.optimizer.MomentumOptimizer( - learning_rate, - 0.9, - regularization=fluid.regularizer.L2DecayRegularizer(3e-4), - parameter_list=model_parameters, - grad_clip=clip) - - train_loader = fluid.io.DataLoader.from_generator( - capacity=1024, - use_double_buffer=True, - iterable=True, - return_list=True) - valid_loader = fluid.io.DataLoader.from_generator( - capacity=1024, - use_double_buffer=True, - iterable=True, - return_list=True) - train_loader.set_batch_generator(train_reader, places=place) - valid_loader.set_batch_generator(val_reader, places=place) - - architect = Architect(model, learning_rate, 3e-4, place, False) - - for epoch_id in range(epoch): - train_one_epoch(model, architect, train_loader, valid_loader, - optimizer, epoch_id, use_data_parallel, log_freq) - valid_one_epoch(model, valid_loader, epoch_id, log_freq) - print(model.student._encoder.alphas.numpy()) - print("=" * 100) - - -if __name__ == '__main__': - main() diff --git a/demo/bert/train_distill.py b/demo/bert/train_distill.py new file mode 100755 index 00000000..8f4758b0 --- /dev/null +++ b/demo/bert/train_distill.py @@ -0,0 +1,204 @@ +import numpy as np +from itertools import izip +import paddle.fluid as fluid +from paddleslim.teachers.bert.reader.cls import * +from paddleslim.nas.darts.search_space import AdaBERTClassifier +from paddle.fluid.dygraph.base import to_variable +from tqdm import tqdm +import os +import pickle + +import logging +from paddleslim.common import AvgrageMeter, get_logger +logger = get_logger(__name__, level=logging.INFO) + + +def valid_one_epoch(model, valid_loader, epoch, log_freq): + accs = AvgrageMeter() + ce_losses = AvgrageMeter() + model.student.eval() + + step_id = 0 + for valid_data in valid_loader(): + try: + loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch) + except: + loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch) + + batch_size = valid_data[0].shape[0] + ce_losses.update(ce_loss.numpy(), batch_size) + accs.update(acc.numpy(), batch_size) + step_id += 1 + return ce_losses.avg[0], accs.avg[0] + + +def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel, + log_freq): + total_losses = AvgrageMeter() + accs = AvgrageMeter() + ce_losses = AvgrageMeter() + kd_losses = AvgrageMeter() + model.student.train() + + step_id = 0 + for train_data in train_loader(): + batch_size = train_data[0].shape[0] + + if use_data_parallel: + total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss( + train_data, epoch) + else: + total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data, + epoch) + + if use_data_parallel: + total_loss = model.scale_loss(total_loss) + total_loss.backward() + model.apply_collective_grads() + else: + total_loss.backward() + optimizer.minimize(total_loss) + model.clear_gradients() + total_losses.update(total_loss.numpy(), batch_size) + accs.update(acc.numpy(), batch_size) + ce_losses.update(ce_loss.numpy(), batch_size) + kd_losses.update(kd_loss.numpy(), batch_size) + + if step_id % log_freq == 0: + logger.info( + "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f};". + format(epoch, step_id, + optimizer.current_step_lr(), total_losses.avg[0], + ce_losses.avg[0], kd_losses.avg[0], accs.avg[0])) + step_id += 1 + + +def main(): + # whether use multi-gpus + use_data_parallel = False + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env( + ).dev_id) if use_data_parallel else fluid.CUDAPlace(0) + + BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12" + vocab_path = BERT_BASE_PATH + "/vocab.txt" + + do_lower_case = True + # augmented dataset nums + # num_samples = 8016987 + + max_seq_len = 128 + batch_size = 192 + hidden_size = 768 + emb_size = 768 + epoch = 80 + log_freq = 10 + + task_name = 'mnli' + + if task_name == 'mrpc': + data_dir = "./data/glue_data/MRPC/" + teacher_model_dir = "./data/teacher_model/mrpc" + num_samples = 3668 + max_layer = 4 + num_labels = 2 + processor_func = MrpcProcessor + elif task_name == 'mnli': + data_dir = "./data/glue_data/MNLI/" + teacher_model_dir = "./data/teacher_model/steps_23000" + num_samples = 392702 + max_layer = 8 + num_labels = 3 + processor_func = MnliProcessor + + device_num = fluid.dygraph.parallel.Env().nranks + use_fixed_gumbel = True + train_phase = "train" + val_phase = "dev" + step_per_epoch = int(num_samples / (batch_size * device_num)) + + with fluid.dygraph.guard(place): + if use_fixed_gumbel: + # make sure gumbel arch is constant + np.random.seed(1) + fluid.default_main_program().random_seed = 1 + model = AdaBERTClassifier( + num_labels, + n_layer=max_layer, + hidden_size=hidden_size, + task_name=task_name, + emb_size=emb_size, + teacher_model=teacher_model_dir, + data_dir=data_dir, + use_fixed_gumbel=use_fixed_gumbel) + + learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch) + + model_parameters = [] + for p in model.parameters(): + if (p.name not in [a.name for a in model.arch_parameters()] and + p.name not in + [a.name for a in model.teacher.parameters()]): + model_parameters.append(p) + + optimizer = fluid.optimizer.MomentumOptimizer( + learning_rate, + 0.9, + regularization=fluid.regularizer.L2DecayRegularizer(3e-4), + parameter_list=model_parameters) + + processor = processor_func( + data_dir=data_dir, + vocab_path=vocab_path, + max_seq_len=max_seq_len, + do_lower_case=do_lower_case, + in_tokens=False) + + train_reader = processor.data_generator( + batch_size=batch_size, + phase=train_phase, + epoch=1, + dev_count=1, + shuffle=True) + dev_reader = processor.data_generator( + batch_size=batch_size, + phase=val_phase, + epoch=1, + dev_count=1, + shuffle=False) + + if use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + + train_loader = fluid.io.DataLoader.from_generator( + capacity=128, + use_double_buffer=True, + iterable=True, + return_list=True) + dev_loader = fluid.io.DataLoader.from_generator( + capacity=128, + use_double_buffer=True, + iterable=True, + return_list=True) + + train_loader.set_batch_generator(train_reader, places=place) + dev_loader.set_batch_generator(dev_reader, places=place) + + if use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + model = fluid.dygraph.parallel.DataParallel(model, strategy) + + best_valid_acc = 0 + for epoch_id in range(epoch): + train_one_epoch(model, train_loader, optimizer, epoch_id, + use_data_parallel, log_freq) + loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq) + if acc > best_valid_acc: + best_valid_acc = acc + logger.info( + "dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format( + loss, acc, best_valid_acc)) + + +if __name__ == '__main__': + main() diff --git a/demo/bert/train_search.py b/demo/bert/train_search.py new file mode 100755 index 00000000..685c9f99 --- /dev/null +++ b/demo/bert/train_search.py @@ -0,0 +1,232 @@ +import numpy as np +from itertools import izip +import paddle.fluid as fluid +from paddleslim.teachers.bert.reader.cls import * +from paddleslim.nas.darts.search_space import AdaBERTClassifier +from paddle.fluid.dygraph.base import to_variable +from tqdm import tqdm +import os +import pickle + +import logging +from paddleslim.common import AvgrageMeter, get_logger +logger = get_logger(__name__, level=logging.INFO) + + +def valid_one_epoch(model, valid_loader, epoch, log_freq): + accs = AvgrageMeter() + ce_losses = AvgrageMeter() + model.student.eval() + + step_id = 0 + for valid_data in valid_loader(): + try: + loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch) + except: + loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch) + + batch_size = valid_data[0].shape[0] + ce_losses.update(ce_loss.numpy(), batch_size) + accs.update(acc.numpy(), batch_size) + step_id += 1 + return ce_losses.avg[0], accs.avg[0] + + +def train_one_epoch(model, train_loader, valid_loader, optimizer, + arch_optimizer, epoch, use_data_parallel, log_freq): + total_losses = AvgrageMeter() + accs = AvgrageMeter() + ce_losses = AvgrageMeter() + kd_losses = AvgrageMeter() + val_accs = AvgrageMeter() + model.student.train() + + step_id = 0 + for train_data, valid_data in izip(train_loader(), valid_loader()): + batch_size = train_data[0].shape[0] + # make sure arch on every gpu is same, otherwise an error will occurs + np.random.seed(step_id * 2 * (epoch + 1)) + if use_data_parallel: + total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss( + train_data, epoch) + else: + total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data, + epoch) + + if use_data_parallel: + total_loss = model.scale_loss(total_loss) + total_loss.backward() + model.apply_collective_grads() + else: + total_loss.backward() + optimizer.minimize(total_loss) + model.clear_gradients() + total_losses.update(total_loss.numpy(), batch_size) + accs.update(acc.numpy(), batch_size) + ce_losses.update(ce_loss.numpy(), batch_size) + kd_losses.update(kd_loss.numpy(), batch_size) + + # make sure arch on every gpu is same, otherwise an error will occurs + np.random.seed(step_id * 2 * (epoch + 1) + 1) + if use_data_parallel: + arch_loss, _, _, _, arch_logits = model._layers.loss(valid_data, + epoch) + else: + arch_loss, _, _, _, arch_logits = model.loss(valid_data, epoch) + + if use_data_parallel: + arch_loss = model.scale_loss(arch_loss) + arch_loss.backward() + model.apply_collective_grads() + else: + arch_loss.backward() + arch_optimizer.minimize(arch_loss) + model.clear_gradients() + probs = fluid.layers.softmax(arch_logits[-1]) + val_acc = fluid.layers.accuracy(input=probs, label=valid_data[4]) + val_accs.update(val_acc.numpy(), batch_size) + + if step_id % log_freq == 0: + logger.info( + "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, search_valid_acc {:.6f};". + format(epoch, step_id, + optimizer.current_step_lr(), total_losses.avg[ + 0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0], + val_accs.avg[0])) + + step_id += 1 + + +def main(): + # whether use multi-gpus + use_data_parallel = False + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env( + ).dev_id) if use_data_parallel else fluid.CUDAPlace(0) + + BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12" + vocab_path = BERT_BASE_PATH + "/vocab.txt" + data_dir = "./data/glue_data/MNLI/" + teacher_model_dir = "./data/teacher_model/steps_23000" + do_lower_case = True + num_samples = 392702 + # augmented dataset nums + # num_samples = 8016987 + max_seq_len = 128 + batch_size = 128 + hidden_size = 768 + emb_size = 768 + max_layer = 8 + epoch = 80 + log_freq = 10 + device_num = fluid.dygraph.parallel.Env().nranks + + use_fixed_gumbel = False + train_phase = "search_train" + val_phase = "search_valid" + step_per_epoch = int(num_samples * 0.5 / ((batch_size) * device_num)) + + with fluid.dygraph.guard(place): + model = AdaBERTClassifier( + 3, + n_layer=max_layer, + hidden_size=hidden_size, + emb_size=emb_size, + teacher_model=teacher_model_dir, + data_dir=data_dir, + use_fixed_gumbel=use_fixed_gumbel) + + learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch) + + model_parameters = [] + for p in model.parameters(): + if (p.name not in [a.name for a in model.arch_parameters()] and + p.name not in + [a.name for a in model.teacher.parameters()]): + model_parameters.append(p) + + optimizer = fluid.optimizer.MomentumOptimizer( + learning_rate, + 0.9, + regularization=fluid.regularizer.L2DecayRegularizer(3e-4), + parameter_list=model_parameters) + + arch_optimizer = fluid.optimizer.Adam( + 3e-4, + 0.5, + 0.999, + regularization=fluid.regularizer.L2Decay(1e-3), + parameter_list=model.arch_parameters()) + + processor = MnliProcessor( + data_dir=data_dir, + vocab_path=vocab_path, + max_seq_len=max_seq_len, + do_lower_case=do_lower_case, + in_tokens=False) + + train_reader = processor.data_generator( + batch_size=batch_size, + phase=train_phase, + epoch=1, + dev_count=1, + shuffle=True) + valid_reader = processor.data_generator( + batch_size=batch_size, + phase=val_phase, + epoch=1, + dev_count=1, + shuffle=True) + dev_reader = processor.data_generator( + batch_size=batch_size, + phase="dev", + epoch=1, + dev_count=1, + shuffle=False) + + if use_data_parallel: + train_reader = fluid.contrib.reader.distributed_batch_reader( + train_reader) + valid_reader = fluid.contrib.reader.distributed_batch_reader( + valid_reader) + + train_loader = fluid.io.DataLoader.from_generator( + capacity=128, + use_double_buffer=True, + iterable=True, + return_list=True) + valid_loader = fluid.io.DataLoader.from_generator( + capacity=128, + use_double_buffer=True, + iterable=True, + return_list=True) + dev_loader = fluid.io.DataLoader.from_generator( + capacity=128, + use_double_buffer=True, + iterable=True, + return_list=True) + + train_loader.set_batch_generator(train_reader, places=place) + valid_loader.set_batch_generator(valid_reader, places=place) + dev_loader.set_batch_generator(dev_reader, places=place) + + if use_data_parallel: + strategy = fluid.dygraph.parallel.prepare_context() + model = fluid.dygraph.parallel.DataParallel(model, strategy) + + for epoch_id in range(epoch): + train_one_epoch(model, train_loader, valid_loader, optimizer, + arch_optimizer, epoch_id, use_data_parallel, + log_freq) + loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq) + logger.info("dev set, ce_loss {:.6f}; acc: {:.6f};".format(loss, + acc)) + + if use_data_parallel: + print(model._layers.student._encoder.alphas.numpy()) + else: + print(model.student._encoder.alphas.numpy()) + print("=" * 100) + + +if __name__ == '__main__': + main() diff --git a/paddleslim/nas/darts/architect_for_bert.py b/paddleslim/nas/darts/architect_for_bert.py index 2c4f2645..b1f6df26 100644 --- a/paddleslim/nas/darts/architect_for_bert.py +++ b/paddleslim/nas/darts/architect_for_bert.py @@ -49,17 +49,17 @@ class Architect(object): self.network_weight_decay), parameter_list=self.unrolled_model_params) - def step(self, train_data, valid_data): + def step(self, train_data, valid_data, epoch): if self.unrolled: params_grads = self._backward_step_unrolled(train_data, valid_data) self.optimizer.apply_gradients(params_grads) else: - loss = self._backward_step(valid_data) + loss = self._backward_step(valid_data, epoch) self.optimizer.minimize(loss) self.optimizer.clear_gradients() - def _backward_step(self, valid_data): - loss = self.model.loss(valid_data) + def _backward_step(self, valid_data, epoch): + loss = self.model.loss(valid_data, epoch) loss[0].backward() return loss[0] diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py index 54a65a4e..3c46c443 100644 --- a/paddleslim/nas/darts/search_space/conv_bert/cls.py +++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py @@ -31,6 +31,7 @@ import multiprocessing import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable, Layer, Linear +from paddle.fluid.dygraph.base import to_variable from .reader.cls import * from .model.bert import BertModelLayer from .optimization import Optimizer @@ -48,6 +49,7 @@ class AdaBERTClassifier(Layer): hidden_size=768, gamma=0.8, beta=4, + task_name='mnli', conv_type="conv_bn", search_layer=False, teacher_model=None, @@ -68,17 +70,21 @@ class AdaBERTClassifier(Layer): self._teacher_model = teacher_model self._data_dir = data_dir self.use_fixed_gumbel = use_fixed_gumbel + self.T = t print( "----------------------load teacher model and test----------------------------------------" ) self.teacher = BERTClassifier( - num_labels, model_path=self._teacher_model) + num_labels, task_name=task_name, model_path=self._teacher_model) + # global setting, will be overwritten when training(about 1% acc loss) + self.teacher.eval() self.teacher.test(self._data_dir) print( "----------------------finish load teacher model and test----------------------------------------" ) self.student = BertModelLayer( + num_labels=num_labels, n_layer=self._n_layer, emb_size=self._emb_size, hidden_size=self._hidden_size, @@ -87,6 +93,7 @@ class AdaBERTClassifier(Layer): use_fixed_gumbel=self.use_fixed_gumbel, gumbel_alphas=gumbel_alphas) + fix_emb = False for s_emb, t_emb in zip(self.student.emb_names(), self.teacher.emb_names()): t_emb.stop_gradient = True @@ -100,91 +107,58 @@ class AdaBERTClassifier(Layer): "Assigned embedding[{}] from teacher to embedding[{}] in student.". format(t_emb.name, s_emb.name)) - self.cls_fc = list() - for i in range(self._n_layer): - fc = Linear( - input_dim=self._hidden_size, - output_dim=self._num_labels, - param_attr=fluid.ParamAttr( - name="s_cls_out_%d_w" % i, - initializer=fluid.initializer.TruncatedNormal(scale=0.02)), - bias_attr=fluid.ParamAttr( - name="s_cls_out_%d_b" % i, - initializer=fluid.initializer.Constant(0.))) - fc = self.add_sublayer("cls_fc_%d" % i, fc) - self.cls_fc.append(fc) - - def forward(self, data_ids): - src_ids = data_ids[0] - position_ids = data_ids[1] - sentence_ids = data_ids[2] - return self.student(src_ids, position_ids, sentence_ids) + def forward(self, data_ids, epoch): + return self.student(data_ids, epoch) def arch_parameters(self): return self.student.arch_parameters() - def genotype(self): - return self.arch_parameters() - - def ce(self, logits): - logits = np.exp(logits - np.max(logits)) - logits = logits / logits.sum(axis=0) - return logits - - def loss(self, data_ids): - src_ids = data_ids[0] - position_ids = data_ids[1] - sentence_ids = data_ids[2] - input_mask = data_ids[3] + def loss(self, data_ids, epoch): labels = data_ids[4] - s_logits = self.student( - src_ids, position_ids, sentence_ids, flops=[], model_size=[]) + s_logits = self.student(data_ids, epoch) - self.teacher.eval() - total_loss, t_logits, t_losses, accuracys, num_seqs = self.teacher( - data_ids) - - # define kd loss - kd_losses = [] + t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids) + #define kd loss kd_weights = [] for i in range(len(s_logits)): j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits)))) kd_weights.append(t_losses[j].numpy()) kd_weights = np.array(kd_weights) - kd_weights = self.ce(-kd_weights) - s_probs = None + kd_weights = np.squeeze(kd_weights) + kd_weights = to_variable(kd_weights) + kd_weights = fluid.layers.softmax(-kd_weights) + + kd_losses = [] for i in range(len(s_logits)): j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits)))) t_logit = t_logits[j] s_logit = s_logits[i] t_logit.stop_gradient = True - t_probs = fluid.layers.softmax(t_logit / self.T) - s_probs = fluid.layers.softmax(s_logit) + t_probs = fluid.layers.softmax(t_logit) # P_j^T + s_probs = fluid.layers.softmax(s_logit / self.T) #P_j^S #kd_loss = -t_probs * fluid.layers.log(s_probs) kd_loss = fluid.layers.cross_entropy( input=s_probs, label=t_probs, soft_label=True) - kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1) - kd_loss = fluid.layers.mean(kd_loss) - # print("kd_loss[{}] = {}; kd_weights[{}] = {}".format(i, kd_loss.numpy(), i, kd_weights[i])) - # tmp = kd_loss * kd_weights[i] - tmp = fluid.layers.scale(kd_loss, scale=kd_weights[i]) - # print("kd_loss[{}] = {}".format(i, tmp.numpy())) - kd_losses.append(tmp) - + kd_loss = fluid.layers.reduce_mean(kd_loss) + kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i]) + kd_losses.append(kd_loss) kd_loss = fluid.layers.sum(kd_losses) - # print("kd_loss = {}".format(kd_loss.numpy())) + losses = [] + for logit in s_logits: + ce_loss, probs = fluid.layers.softmax_with_cross_entropy( + logits=logit, label=labels, return_softmax=True) + loss = fluid.layers.mean(x=ce_loss) + losses.append(loss) + + num_seqs = fluid.layers.create_tensor(dtype='int64') + accuracy = fluid.layers.accuracy( + input=probs, label=labels, total=num_seqs) + ce_loss = fluid.layers.sum(losses) - ce_loss, probs = fluid.layers.softmax_with_cross_entropy( - logits=s_logits[-1], label=labels, return_softmax=True) - ce_loss = fluid.layers.mean(x=ce_loss) - num_seqs = fluid.layers.create_tensor(dtype='int64') - accuracy = fluid.layers.accuracy( - input=probs, label=labels, total=num_seqs) + total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss - loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss - # return ce_loss, accuracy, None, None - return loss, accuracy, ce_loss, kd_loss + return total_loss, accuracy, ce_loss, kd_loss, s_logits diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py index b4aa9da7..38de45e6 100755 --- a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py +++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py @@ -32,6 +32,7 @@ from .transformer_encoder import EncoderLayer class BertModelLayer(Layer): def __init__(self, + num_labels, emb_size=128, hidden_size=768, n_layer=12, @@ -91,6 +92,7 @@ class BertModelLayer(Layer): param_attr=fluid.ParamAttr(name="s_emb_factorization")) self._encoder = EncoderLayer( + num_labels=num_labels, n_layer=self._n_layer, hidden_size=self._hidden_size, search_layer=self._search_layer, @@ -101,6 +103,10 @@ class BertModelLayer(Layer): return self._src_emb.parameters() + self._pos_emb.parameters( ) + self._sent_emb.parameters() + def emb_names(self): + return self._src_emb.parameters() + self._pos_emb.parameters( + ) + self._sent_emb.parameters() + def max_flops(self): return self._encoder.max_flops @@ -110,54 +116,19 @@ class BertModelLayer(Layer): def arch_parameters(self): return [self._encoder.alphas] #, self._encoder.k] - def forward(self, - src_ids, - position_ids, - sentence_ids, - flops=[], - model_size=[]): + def forward(self, data_ids, epoch): """ forward """ - ids = np.squeeze(src_ids.numpy()) - sids = np.squeeze(sentence_ids.numpy()) - batchsize = ids.shape[0] - - ids_0 = ids[((sids == 0) & (ids != 0))] - seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1) - y_0 = np.concatenate([np.arange(s) for s in seqlen_0]) - x_0 = np.concatenate([ - np.ones( - [s], dtype=np.int64) * i for i, s in enumerate(seqlen_0) - ]) - ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64) - ids0[(x_0, y_0)] = ids_0 - - ids_1 = ids[(sids == 1) & (ids != 0)] - seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1) - y_1 = np.concatenate([np.arange(s) for s in seqlen_1]) - x_1 = np.concatenate([ - np.ones( - [s], dtype=np.int64) * i for i, s in enumerate(seqlen_1) - ]) - ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64) - ids1[(x_1, y_1)] = ids_1 - - msl = max(seqlen_0.max(), seqlen_1.max()) - ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]], - mode='constant') - ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]], - mode='constant') - - ids0 = fluid.dygraph.to_variable(ids0) - ids1 = fluid.dygraph.to_variable(ids1) + ids0 = data_ids[5] + ids1 = data_ids[6] src_emb_0 = self._src_emb(ids0) src_emb_1 = self._src_emb(ids1) emb_out_0 = self._emb_fac(src_emb_0) emb_out_1 = self._emb_fac(src_emb_1) - # (bs, seq_len, 768) + # (bs, seq_len, hidden_size) + + enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch) - enc_outputs = self._encoder( - emb_out, flops=flops, model_size=model_size) return enc_outputs diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py index 1f4e41cc..bad9a1e2 100755 --- a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py +++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py @@ -23,14 +23,15 @@ from collections import Iterable import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, BatchNorm, Pool2D, to_variable +from paddle.fluid.dygraph import to_variable from paddle.fluid.initializer import NormalInitializer from paddle.fluid import ParamAttr from paddle.fluid.initializer import MSRA, ConstantInitializer ConvBN_PRIMITIVES = [ 'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3', - 'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none', - 'skip_connect' + 'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', + 'skip_connect', 'none' ] @@ -53,11 +54,6 @@ class MixedOp(fluid.dygraph.Layer): def __init__(self, n_channel, name=None): super(MixedOp, self).__init__() PRIMITIVES = ConvBN_PRIMITIVES - # ops = [ - # OPS[primitive](n_channel, name - # if name is None else name + "/" + primitive) - # for primitive in PRIMITIVES - # ] ops = [] for primitive in PRIMITIVES: op = OPS[primitive](n_channel, name @@ -76,26 +72,17 @@ class MixedOp(fluid.dygraph.Layer): self._ops = fluid.dygraph.LayerList(ops) def forward(self, x, weights): - #out = weights[0] * self._ops[0](x) # out = fluid.layers.sums( - # [weights[i] * op(x) for i, op in enumerate(self._ops)]) + # [weights[i] * op(x) for i, op in enumerate(self._ops)]) # return out - for i in range(len(self._ops)): - - if isinstance(weights, Iterable): - weights_i = weights[i] - else: - weights_i = weights[i].numpy() - - if weights_i != 0: + for i in range(len(weights.numpy())): + if weights[i].numpy() != 0: return self._ops[i](x) * weights[i] -def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10): - #U = np.random.uniform(0, 1, logits.shape) - #U = - to_variable( - # np.log(-np.log(U + eps) + eps).astype("float32")) +def gumbel_softmax(logits, epoch, temperature=1.0, hard=True, eps=1e-10): + temperature = temperature * (0.98**epoch) U = np.random.gumbel(0, 1, logits.shape).astype("float32") logits = logits + to_variable(U) @@ -105,12 +92,12 @@ def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10): if hard: maxes = fluid.layers.reduce_max(logits, dim=1, keep_dim=True) hard = fluid.layers.cast((logits == maxes), logits.dtype) - # out = hard - logits.detach() + logits - tmp = hard - logits - tmp.stop_gradient = True - out = tmp + logits + out = hard - logits.detach() + logits + # tmp.stop_gradient = True + # out = tmp + logits else: out = logits + return out @@ -142,8 +129,6 @@ class ReluConvBN(fluid.dygraph.Layer): use_cudnn=True, name=None): super(ReluConvBN, self).__init__() - #conv_std = (2.0 / - # (filter_size[0] * filter_size[1] * out_c * in_c))**0.5 conv_param = fluid.ParamAttr( name=name if name is None else (name + "_conv.weights"), initializer=fluid.initializer.MSRA()) @@ -215,6 +200,7 @@ class EncoderLayer(Layer): """ def __init__(self, + num_labels, n_layer, hidden_size=768, name="encoder", @@ -224,12 +210,27 @@ class EncoderLayer(Layer): super(EncoderLayer, self).__init__() self._n_layer = n_layer self._hidden_size = hidden_size - self._n_channel = 256 + self._n_channel = 128 self._steps = 3 self._n_ops = len(ConvBN_PRIMITIVES) self.use_fixed_gumbel = use_fixed_gumbel - self.stem = fluid.dygraph.Sequential( + self.stem0 = fluid.dygraph.Sequential( + Conv2D( + num_channels=1, + num_filters=self._n_channel, + filter_size=[3, self._hidden_size], + padding=[1, 0], + param_attr=fluid.ParamAttr(initializer=MSRA()), + bias_attr=False), + BatchNorm( + num_channels=self._n_channel, + param_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=1)), + bias_attr=fluid.ParamAttr( + initializer=fluid.initializer.Constant(value=0)))) + + self.stem1 = fluid.dygraph.Sequential( Conv2D( num_channels=1, num_filters=self._n_channel, @@ -262,16 +263,10 @@ class EncoderLayer(Layer): default_initializer=NormalInitializer( loc=0.0, scale=1e-3)) - # self.k = fluid.layers.create_parameter( - # shape=[1, self._n_layer], - # dtype="float32", - # default_initializer=NormalInitializer( - # loc=0.0, scale=1e-3)) self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True) self.bns = [] self.outs = [] for i in range(self._n_layer): - bn = BatchNorm( num_channels=self._n_channel, param_attr=fluid.ParamAttr( @@ -280,52 +275,53 @@ class EncoderLayer(Layer): bias_attr=fluid.ParamAttr( initializer=fluid.initializer.Constant(value=0), trainable=False)) - self.bns.append(bn) - out = Linear( self._n_channel, - 3, + num_labels, param_attr=ParamAttr(initializer=MSRA()), bias_attr=ParamAttr(initializer=MSRA())) + self.bns.append(bn) self.outs.append(out) + self._bns = fluid.dygraph.LayerList(self.bns) + self._outs = fluid.dygraph.LayerList(self.outs) self.use_fixed_gumbel = use_fixed_gumbel - self.gumbel_alphas = gumbel_softmax(self.alphas) - if gumbel_alphas is not None: - self.gumbel_alphas = np.array(gumbel_alphas).reshape( - self.alphas.shape) - else: - self.gumbel_alphas = gumbel_softmax(self.alphas) - self.gumbel_alphas.stop_gradient = True + #self.gumbel_alphas = gumbel_softmax(self.alphas, 0).detach() + + mrpc_arch = [ + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 0 + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1 + [0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 1 + [0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1 + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2 + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 0 # node2 + [1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # std_conv3 1 + [0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2 + [0, 0, 0, 1, 0, 0, 0, 0, 0, 0] # dil_conv3 3 + ] + self.gumbel_alphas = to_variable( + np.array(mrpc_arch).astype(np.float32)) + self.gumbel_alphas.stop_gradient = True + print("gumbel_alphas: \n", self.gumbel_alphas.numpy()) + + def forward(self, enc_input_0, enc_input_1, epoch, flops=[], + model_size=[]): + alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax( + self.alphas, epoch) - print("gumbel_alphas: {}".format(self.gumbel_alphas)) + s0 = fluid.layers.unsqueeze(enc_input_0, [1]) + s1 = fluid.layers.unsqueeze(enc_input_1, [1]) + s0 = self.stem0(s0) + s1 = self.stem1(s1) - def forward(self, enc_input_0, enc_input_1, flops=[], model_size=[]): - alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax( - self.alphas) - - s0 = fluid.layers.reshape( - enc_input_0, [-1, 1, enc_input_0.shape[1], enc_input_0.shape[2]]) - s1 = fluid.layers.reshape( - enc_input_1, [-1, 1, enc_input_1.shape[1], enc_input_1.shape[2]]) - # (bs, 1, seq_len, hidden_size) - - s0 = self.stem(s0) - s1 = self.stem(s1) - # (bs, n_channel, seq_len, 1) - if self.use_fixed_gumbel: - alphas = self.gumbel_alphas - else: - alphas = gumbel_softmax(self.alphas) - - s0 = s1 = tmp - outputs = [] + enc_outputs = [] for i in range(self._n_layer): s0, s1 = s1, self._cells[i](s0, s1, alphas) - tmp = self.bns[i](s1) - tmp = self.pool2d_avg(tmp) # (bs, n_channel, seq_len, 1) + tmp = self._bns[i](s1) + tmp = self.pool2d_avg(tmp) tmp = fluid.layers.reshape(tmp, shape=[-1, 0]) - tmp = self.outs[i](tmp) - outputs.append(tmp) - return outputs + tmp = self._outs[i](tmp) + enc_outputs.append(tmp) + + return enc_outputs diff --git a/paddleslim/teachers/bert/cls.py b/paddleslim/teachers/bert/cls.py index ad64b2dc..8f7e1a4b 100755 --- a/paddleslim/teachers/bert/cls.py +++ b/paddleslim/teachers/bert/cls.py @@ -58,7 +58,8 @@ class BERTClassifier(Layer): num_labels, task_name="mnli", model_path=None, - use_cuda=True): + use_cuda=True, + return_pooled_out=True): super(BERTClassifier, self).__init__() self.task_name = task_name.lower() BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/" @@ -84,7 +85,7 @@ class BERTClassifier(Layer): } self.cls_model = ClsModelLayer( - self.bert_config, num_labels, return_pooled_out=True) + self.bert_config, num_labels, return_pooled_out=return_pooled_out) if model_path is not None: #restore the model diff --git a/paddleslim/teachers/bert/model/cls.py b/paddleslim/teachers/bert/model/cls.py index 7f84f44e..bdfef8b5 100644 --- a/paddleslim/teachers/bert/model/cls.py +++ b/paddleslim/teachers/bert/model/cls.py @@ -46,6 +46,7 @@ class ClsModelLayer(Layer): self.use_fp16 = use_fp16 self.loss_scaling = loss_scaling self.n_layers = config['num_hidden_layers'] + self.return_pooled_out = return_pooled_out self.bert_layer = BertModelLayer( config=self.config, return_pooled_out=True, use_fp16=self.use_fp16) @@ -79,11 +80,23 @@ class ClsModelLayer(Layer): enc_outputs, next_sent_feats = self.bert_layer( src_ids, position_ids, sentence_ids, input_mask) + + if not self.return_pooled_out: + cls_feat = fluid.layers.dropout( + x=next_sent_feats[-1], + dropout_prob=0.1, + dropout_implementation="upscale_in_train") + logits = self.cls_fc[-1](cls_feat) + probs = fluid.layers.softmax(logits) + num_seqs = fluid.layers.create_tensor(dtype='int64') + accuracy = fluid.layers.accuracy( + input=probs, label=labels, total=num_seqs) + return enc_outputs, logits, accuracy, num_seqs + logits = [] losses = [] accuracys = [] for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc): - cls_feat = fluid.layers.dropout( x=next_sent_feat, dropout_prob=0.1, diff --git a/paddleslim/teachers/bert/reader/cls.py b/paddleslim/teachers/bert/reader/cls.py index d660fab4..cfdfaf38 100644 --- a/paddleslim/teachers/bert/reader/cls.py +++ b/paddleslim/teachers/bert/reader/cls.py @@ -16,6 +16,7 @@ import io import os import types import csv +import random import numpy as np from . import tokenization from .batching import prepare_batch_data @@ -139,6 +140,8 @@ class DataProcessor(object): epoch: int. Total epoches to generate data. shuffle: bool. Whether to shuffle examples. """ + search_examples = self.get_train_examples(self.data_dir) + random.shuffle(search_examples) if phase == 'train': examples = self.get_train_examples(self.data_dir) self.num_examples['train'] = len(examples) @@ -152,13 +155,13 @@ class DataProcessor(object): examples = self.get_test_examples(self.data_dir) self.num_examples['test'] = len(examples) elif phase == 'search_train': - examples = self.get_train_examples(self.data_dir) - self.num_examples['search_train'] = len(examples) / 2 - examples = examples[:self.num_examples['search_train']] + #examples = self.get_train_examples(self.data_dir) + self.num_examples['search_train'] = len(search_examples) / 2 + examples = search_examples[:self.num_examples['search_train']] elif phase == 'search_valid': - examples = self.get_train_examples(self.data_dir) - self.num_examples['search_valid'] = len(examples) / 2 - examples = examples[self.num_examples['search_train']:] + #examples = self.get_train_examples(self.data_dir) + self.num_examples['search_valid'] = len(search_examples) / 2 + examples = search_examples[self.num_examples['search_valid']:] else: raise ValueError( "Unknown phase, which should be in ['train', 'dev', 'test'].") @@ -213,16 +216,53 @@ class DataProcessor(object): return_input_mask=True, return_max_len=False, return_num_token=False) + if len(all_dev_batches) < dev_count: all_dev_batches.append(batch_data) if len(all_dev_batches) == dev_count: for batch in all_dev_batches: + batch = self.split_seq_pair(batch) yield batch all_dev_batches = [] return wrapper + def split_seq_pair(self, data_ids): + src_ids = data_ids[0] + sentence_ids = data_ids[2] + + ids = np.squeeze(src_ids) + sids = np.squeeze(sentence_ids) + batchsize = ids.shape[0] + + ids_0 = ids[((sids == 0) & (ids != 0))] + seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1) + y_0 = np.concatenate([np.arange(s) for s in seqlen_0]) + x_0 = np.concatenate([ + np.ones( + [s], dtype=np.int64) * i for i, s in enumerate(seqlen_0) + ]) + ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64) + ids0[(x_0, y_0)] = ids_0 + + ids_1 = ids[(sids == 1) & (ids != 0)] + seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1) + y_1 = np.concatenate([np.arange(s) for s in seqlen_1]) + x_1 = np.concatenate([ + np.ones( + [s], dtype=np.int64) * i for i, s in enumerate(seqlen_1) + ]) + ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64) + ids1[(x_1, y_1)] = ids_1 + + msl = max(seqlen_0.max(), seqlen_1.max()) + ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]], + mode='constant') + ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]], + mode='constant') + return data_ids + [ids0, ids1] + class InputExample(object): """A single training/test example for simple sequence classification.""" -- GitLab