Unverified commit 8c7465b2 authored by Bai Yifan, committed by GitHub

Update bert distillation and search code (#376)

Parent 2bb6377f
import paddle.fluid as fluid
from paddleslim.teachers.bert.reader.cls import *
from paddleslim.nas.darts.search_space import AdaBERTClassifier
from paddleslim.nas.darts import DARTSearch
def main():
place = fluid.CUDAPlace(0)
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
vocab_path = BERT_BASE_PATH + "/vocab.txt"
data_dir = "./data/glue_data/MNLI/"
max_seq_len = 512
do_lower_case = True
batch_size = 32
epoch = 30
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=1,
shuffle=True)
val_reader = processor.data_generator(
batch_size=batch_size,
phase='train',
epoch=epoch,
dev_count=1,
shuffle=True)
with fluid.dygraph.guard(place):
model = AdaBERTClassifier(
3,
teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000"
)
searcher = DARTSearch(
model,
train_reader,
val_reader,
batchsize=batch_size,
num_epochs=epoch,
log_freq=10)
searcher.train()
if __name__ == '__main__':
main()
......@@ -3,192 +3,201 @@ from itertools import izip
import paddle.fluid as fluid
from paddleslim.teachers.bert.reader.cls import *
from paddleslim.nas.darts.search_space import AdaBERTClassifier
from paddleslim.nas.darts.architect_for_bert import Architect
from paddle.fluid.dygraph.base import to_variable
from tqdm import tqdm
import os
import pickle
import logging
from paddleslim.common import AvgrageMeter, get_logger
logger = get_logger(__name__, level=logging.INFO)
def count_parameters_in_MB(all_params):
parameters_number = 0
for param in all_params:
if param.trainable:
parameters_number += np.prod(param.shape)
return parameters_number / 1e6
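# Illustrative usage (a hypothetical call; any dygraph model exposing .parameters()
# works, e.g. the AdaBERTClassifier student defined below):
#   logger.info("student size: {:.2f} MB".format(
#       count_parameters_in_MB(model.student.parameters())))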
def model_loss(model, data_ids):
# src_ids = data_ids[0]
# position_ids = data_ids[1]
# sentence_ids = data_ids[2]
# input_mask = data_ids[3]
labels = data_ids[4]
labels.stop_gradient = True
def valid_one_epoch(model, valid_loader, epoch, log_freq):
accs = AvgrageMeter()
ce_losses = AvgrageMeter()
model.student.eval()
enc_output = model(data_ids)
step_id = 0
for valid_data in valid_loader():
try:
loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
except AttributeError:  # model is not wrapped in DataParallel
loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=enc_output, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
return loss, accuracy
batch_size = valid_data[0].shape[0]
ce_losses.update(ce_loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
step_id += 1
return ce_losses.avg[0], accs.avg[0]
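# The meters above follow a running weighted-average protocol. Below is a minimal
# sketch of the interface relied on here (update(value, n) and .avg), assuming
# numpy is imported as np at the top of this file and array-valued inputs; it is
# illustrative only, not a drop-in replacement for paddleslim's AvgrageMeter:
class _AvgMeterSketch(object):
    def __init__(self):
        self.sum = 0.0
        self.cnt = 0

    def update(self, val, n=1):
        # accumulate a batch-weighted sum and the number of samples seen
        self.sum = self.sum + np.asarray(val) * n
        self.cnt += n

    @property
    def avg(self):
        # mean over all samples seen so far (array-valued, so avg[0] indexes it)
        return self.sum / max(self.cnt, 1)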
def train_one_epoch(model, architect, train_loader, valid_loader, optimizer,
epoch, use_data_parallel, log_freq):
ce_losses = AvgrageMeter()
def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
log_freq):
total_losses = AvgrageMeter()
accs = AvgrageMeter()
model.train()
ce_losses = AvgrageMeter()
kd_losses = AvgrageMeter()
model.student.train()
step_id = 0
for train_data, valid_data in izip(train_loader(), valid_loader):
architect.step(train_data, valid_data)
loss, acc = model_loss(model, train_data)
for train_data in train_loader():
batch_size = train_data[0].shape[0]
if use_data_parallel:
loss = model.scale_loss(loss)
loss.backward()
model.apply_collective_grads()
total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
train_data, epoch)
else:
loss.backward()
total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
epoch)
optimizer.minimize(loss)
if use_data_parallel:
total_loss = model.scale_loss(total_loss)
total_loss.backward()
model.apply_collective_grads()
else:
total_loss.backward()
optimizer.minimize(total_loss)
model.clear_gradients()
batch_size = train_data[0].shape[0]
ce_losses.update(loss.numpy(), batch_size)
total_losses.update(total_loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
ce_losses.update(ce_loss.numpy(), batch_size)
kd_losses.update(kd_loss.numpy(), batch_size)
if step_id % log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, Lr {:.6f} loss {:.6f}; acc: {:.6f};".
"Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f};".
format(epoch, step_id,
optimizer.current_step_lr(), ce_losses.avg[0], accs.avg[
0]))
step_id += 1
def valid_one_epoch(model, valid_loader, epoch, log_freq):
ce_losses = AvgrageMeter()
accs = AvgrageMeter()
model.eval()
step_id = 0
for valid_data in valid_loader():
loss, acc = model_loss(model, valid_data)
batch_size = valid_data[0].shape[0]
ce_losses.update(loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
if step_id % log_freq == 0:
logger.info("Valid Epoch {}, Step {}, loss {:.6f}; acc: {:.6f};".
format(epoch, step_id, ce_losses.avg[0], accs.avg[0]))
optimizer.current_step_lr(), total_losses.avg[0],
ce_losses.avg[0], kd_losses.avg[0], accs.avg[0]))
step_id += 1
def main():
# whether to use multiple GPUs
use_data_parallel = False
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
vocab_path = BERT_BASE_PATH + "/vocab.txt"
data_dir = "./data/glue_data/MNLI/"
teacher_model_dir = "./teacher_model/steps_23000"
num_samples = 392702
max_seq_len = 128
do_lower_case = True
batch_size = 128
# augmented dataset nums
# num_samples = 8016987
max_seq_len = 128
batch_size = 192
hidden_size = 768
emb_size = 768
max_layer = 8
epoch = 80
log_freq = 10
use_fixed_gumbel = True
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase='search_train',
epoch=1,
dev_count=1,
shuffle=True)
val_reader = processor.data_generator(
batch_size=batch_size,
phase='search_valid',
epoch=1,
dev_count=1,
shuffle=True)
if use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
valid_reader = fluid.contrib.reader.distributed_batch_reader(
valid_reader)
task_name = 'mnli'
if task_name == 'mrpc':
data_dir = "./data/glue_data/MRPC/"
teacher_model_dir = "./data/teacher_model/mrpc"
num_samples = 3668
max_layer = 4
num_labels = 2
processor_func = MrpcProcessor
elif task_name == 'mnli':
data_dir = "./data/glue_data/MNLI/"
teacher_model_dir = "./data/teacher_model/steps_23000"
num_samples = 392702
max_layer = 8
num_labels = 3
processor_func = MnliProcessor
device_num = fluid.dygraph.parallel.Env().nranks
use_fixed_gumbel = True
train_phase = "train"
val_phase = "dev"
step_per_epoch = int(num_samples / (batch_size * device_num))
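# e.g. with num_samples=392702 (MNLI train), batch_size=128 and a single GPU this
# gives int(392702 / 128) = 3067 optimizer steps per epoch.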
with fluid.dygraph.guard(place):
if use_fixed_gumbel:
# make sure gumbel arch is constant
np.random.seed(1)
fluid.default_main_program().random_seed = 1
model = AdaBERTClassifier(
3,
num_labels,
n_layer=max_layer,
hidden_size=hidden_size,
task_name=task_name,
emb_size=emb_size,
teacher_model=teacher_model_dir,
data_dir=data_dir,
use_fixed_gumbel=use_fixed_gumbel)
if use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
device_num = fluid.dygraph.parallel.Env().nranks
step_per_epoch = int(num_samples / (batch_size * device_num))
learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
model_parameters = [
p for p in model.parameters()
if p.name not in [a.name for a in model.arch_parameters()]
]
model_parameters = []
for p in model.parameters():
if (p.name not in [a.name for a in model.arch_parameters()] and
p.name not in
[a.name for a in model.teacher.parameters()]):
model_parameters.append(p)
clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
parameter_list=model_parameters,
grad_clip=clip)
parameter_list=model_parameters)
processor = processor_func(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase=train_phase,
epoch=1,
dev_count=1,
shuffle=True)
dev_reader = processor.data_generator(
batch_size=batch_size,
phase=val_phase,
epoch=1,
dev_count=1,
shuffle=False)
if use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
train_loader = fluid.io.DataLoader.from_generator(
capacity=1024,
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=1024,
dev_loader = fluid.io.DataLoader.from_generator(
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
train_loader.set_batch_generator(train_reader, places=place)
valid_loader.set_batch_generator(val_reader, places=place)
dev_loader.set_batch_generator(dev_reader, places=place)
architect = Architect(model, learning_rate, 3e-4, place, False)
if use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
best_valid_acc = 0
for epoch_id in range(epoch):
train_one_epoch(model, architect, train_loader, valid_loader,
optimizer, epoch_id, use_data_parallel, log_freq)
valid_one_epoch(model, valid_loader, epoch_id, log_freq)
print(model.student._encoder.alphas.numpy())
print("=" * 100)
train_one_epoch(model, train_loader, optimizer, epoch_id,
use_data_parallel, log_freq)
loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
if acc > best_valid_acc:
best_valid_acc = acc
logger.info(
"dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format(
loss, acc, best_valid_acc))
if __name__ == '__main__':
......
import numpy as np
from itertools import izip
import paddle.fluid as fluid
from paddleslim.teachers.bert.reader.cls import *
from paddleslim.nas.darts.search_space import AdaBERTClassifier
from paddle.fluid.dygraph.base import to_variable
from tqdm import tqdm
import os
import pickle
import logging
from paddleslim.common import AvgrageMeter, get_logger
logger = get_logger(__name__, level=logging.INFO)
def valid_one_epoch(model, valid_loader, epoch, log_freq):
accs = AvgrageMeter()
ce_losses = AvgrageMeter()
model.student.eval()
step_id = 0
for valid_data in valid_loader():
try:
loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
except AttributeError:  # model is not wrapped in DataParallel
loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
batch_size = valid_data[0].shape[0]
ce_losses.update(ce_loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
step_id += 1
return ce_losses.avg[0], accs.avg[0]
def train_one_epoch(model, train_loader, valid_loader, optimizer,
arch_optimizer, epoch, use_data_parallel, log_freq):
total_losses = AvgrageMeter()
accs = AvgrageMeter()
ce_losses = AvgrageMeter()
kd_losses = AvgrageMeter()
val_accs = AvgrageMeter()
model.student.train()
step_id = 0
for train_data, valid_data in izip(train_loader(), valid_loader()):
batch_size = train_data[0].shape[0]
# make sure the arch on every GPU is the same, otherwise an error will occur
np.random.seed(step_id * 2 * (epoch + 1))
if use_data_parallel:
total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
train_data, epoch)
else:
total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
epoch)
if use_data_parallel:
total_loss = model.scale_loss(total_loss)
total_loss.backward()
model.apply_collective_grads()
else:
total_loss.backward()
optimizer.minimize(total_loss)
model.clear_gradients()
total_losses.update(total_loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
ce_losses.update(ce_loss.numpy(), batch_size)
kd_losses.update(kd_loss.numpy(), batch_size)
# make sure the arch on every GPU is the same, otherwise an error will occur
np.random.seed(step_id * 2 * (epoch + 1) + 1)
if use_data_parallel:
arch_loss, _, _, _, arch_logits = model._layers.loss(valid_data,
epoch)
else:
arch_loss, _, _, _, arch_logits = model.loss(valid_data, epoch)
if use_data_parallel:
arch_loss = model.scale_loss(arch_loss)
arch_loss.backward()
model.apply_collective_grads()
else:
arch_loss.backward()
arch_optimizer.minimize(arch_loss)
model.clear_gradients()
probs = fluid.layers.softmax(arch_logits[-1])
val_acc = fluid.layers.accuracy(input=probs, label=valid_data[4])
val_accs.update(val_acc.numpy(), batch_size)
if step_id % log_freq == 0:
logger.info(
"Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, search_valid_acc {:.6f};".
format(epoch, step_id,
optimizer.current_step_lr(), total_losses.avg[
0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0],
val_accs.avg[0]))
step_id += 1
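# Each search step above is thus one round of first-order bi-level optimization:
# the Momentum optimizer updates the student weights on a train batch using
# (1 - gamma) * CE + gamma * KD, then the Adam arch_optimizer updates the
# architecture parameters (alphas) on a validation batch; the seeded
# np.random.seed() calls keep the sampled Gumbel architecture identical across GPUs.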
def main():
# whether to use multiple GPUs
use_data_parallel = False
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
vocab_path = BERT_BASE_PATH + "/vocab.txt"
data_dir = "./data/glue_data/MNLI/"
teacher_model_dir = "./data/teacher_model/steps_23000"
do_lower_case = True
num_samples = 392702
# augmented dataset nums
# num_samples = 8016987
max_seq_len = 128
batch_size = 128
hidden_size = 768
emb_size = 768
max_layer = 8
epoch = 80
log_freq = 10
device_num = fluid.dygraph.parallel.Env().nranks
use_fixed_gumbel = False
train_phase = "search_train"
val_phase = "search_valid"
step_per_epoch = int(num_samples * 0.5 / ((batch_size) * device_num))
with fluid.dygraph.guard(place):
model = AdaBERTClassifier(
3,
n_layer=max_layer,
hidden_size=hidden_size,
emb_size=emb_size,
teacher_model=teacher_model_dir,
data_dir=data_dir,
use_fixed_gumbel=use_fixed_gumbel)
learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
model_parameters = []
for p in model.parameters():
if (p.name not in [a.name for a in model.arch_parameters()] and
p.name not in
[a.name for a in model.teacher.parameters()]):
model_parameters.append(p)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
parameter_list=model_parameters)
arch_optimizer = fluid.optimizer.Adam(
3e-4,
0.5,
0.999,
regularization=fluid.regularizer.L2Decay(1e-3),
parameter_list=model.arch_parameters())
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase=train_phase,
epoch=1,
dev_count=1,
shuffle=True)
valid_reader = processor.data_generator(
batch_size=batch_size,
phase=val_phase,
epoch=1,
dev_count=1,
shuffle=True)
dev_reader = processor.data_generator(
batch_size=batch_size,
phase="dev",
epoch=1,
dev_count=1,
shuffle=False)
if use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
valid_reader = fluid.contrib.reader.distributed_batch_reader(
valid_reader)
train_loader = fluid.io.DataLoader.from_generator(
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
dev_loader = fluid.io.DataLoader.from_generator(
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
train_loader.set_batch_generator(train_reader, places=place)
valid_loader.set_batch_generator(valid_reader, places=place)
dev_loader.set_batch_generator(dev_reader, places=place)
if use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
model = fluid.dygraph.parallel.DataParallel(model, strategy)
for epoch_id in range(epoch):
train_one_epoch(model, train_loader, valid_loader, optimizer,
arch_optimizer, epoch_id, use_data_parallel,
log_freq)
loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
logger.info("dev set, ce_loss {:.6f}; acc: {:.6f};".format(loss,
acc))
if use_data_parallel:
print(model._layers.student._encoder.alphas.numpy())
else:
print(model.student._encoder.alphas.numpy())
print("=" * 100)
if __name__ == '__main__':
main()
......@@ -49,17 +49,17 @@ class Architect(object):
self.network_weight_decay),
parameter_list=self.unrolled_model_params)
def step(self, train_data, valid_data):
def step(self, train_data, valid_data, epoch):
if self.unrolled:
params_grads = self._backward_step_unrolled(train_data, valid_data)
self.optimizer.apply_gradients(params_grads)
else:
loss = self._backward_step(valid_data)
loss = self._backward_step(valid_data, epoch)
self.optimizer.minimize(loss)
self.optimizer.clear_gradients()
def _backward_step(self, valid_data):
loss = self.model.loss(valid_data)
def _backward_step(self, valid_data, epoch):
loss = self.model.loss(valid_data, epoch)
loss[0].backward()
return loss[0]
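# Sketch of the two branches handled by step() above: with unrolled=True the
# architecture gradient is taken through a virtual weight update,
#   w' = w - xi * d L_train(w, alpha) / d w,   grad_alpha = d L_valid(w', alpha) / d alpha
# (second-order DARTS), whereas the first-order path used here simply
# back-propagates loss(valid_data, epoch) directly into the architecture parameters.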
......
......@@ -31,6 +31,7 @@ import multiprocessing
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer, Linear
from paddle.fluid.dygraph.base import to_variable
from .reader.cls import *
from .model.bert import BertModelLayer
from .optimization import Optimizer
......@@ -48,6 +49,7 @@ class AdaBERTClassifier(Layer):
hidden_size=768,
gamma=0.8,
beta=4,
task_name='mnli',
conv_type="conv_bn",
search_layer=False,
teacher_model=None,
......@@ -68,17 +70,21 @@ class AdaBERTClassifier(Layer):
self._teacher_model = teacher_model
self._data_dir = data_dir
self.use_fixed_gumbel = use_fixed_gumbel
self.T = t
print(
"----------------------load teacher model and test----------------------------------------"
)
self.teacher = BERTClassifier(
num_labels, model_path=self._teacher_model)
num_labels, task_name=task_name, model_path=self._teacher_model)
# global setting; will be overwritten during training (about 1% acc loss)
self.teacher.eval()
self.teacher.test(self._data_dir)
print(
"----------------------finish load teacher model and test----------------------------------------"
)
self.student = BertModelLayer(
num_labels=num_labels,
n_layer=self._n_layer,
emb_size=self._emb_size,
hidden_size=self._hidden_size,
......@@ -87,6 +93,7 @@ class AdaBERTClassifier(Layer):
use_fixed_gumbel=self.use_fixed_gumbel,
gumbel_alphas=gumbel_alphas)
fix_emb = False
for s_emb, t_emb in zip(self.student.emb_names(),
self.teacher.emb_names()):
t_emb.stop_gradient = True
......@@ -100,91 +107,58 @@ class AdaBERTClassifier(Layer):
"Assigned embedding[{}] from teacher to embedding[{}] in student.".
format(t_emb.name, s_emb.name))
self.cls_fc = list()
for i in range(self._n_layer):
fc = Linear(
input_dim=self._hidden_size,
output_dim=self._num_labels,
param_attr=fluid.ParamAttr(
name="s_cls_out_%d_w" % i,
initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
bias_attr=fluid.ParamAttr(
name="s_cls_out_%d_b" % i,
initializer=fluid.initializer.Constant(0.)))
fc = self.add_sublayer("cls_fc_%d" % i, fc)
self.cls_fc.append(fc)
def forward(self, data_ids):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
return self.student(src_ids, position_ids, sentence_ids)
def forward(self, data_ids, epoch):
return self.student(data_ids, epoch)
def arch_parameters(self):
return self.student.arch_parameters()
def genotype(self):
return self.arch_parameters()
def ce(self, logits):
logits = np.exp(logits - np.max(logits))
logits = logits / logits.sum(axis=0)
return logits
def loss(self, data_ids):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
def loss(self, data_ids, epoch):
labels = data_ids[4]
s_logits = self.student(
src_ids, position_ids, sentence_ids, flops=[], model_size=[])
s_logits = self.student(data_ids, epoch)
self.teacher.eval()
total_loss, t_logits, t_losses, accuracys, num_seqs = self.teacher(
data_ids)
# define kd loss
kd_losses = []
t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)
# define kd loss
kd_weights = []
for i in range(len(s_logits)):
j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
kd_weights.append(t_losses[j].numpy())
kd_weights = np.array(kd_weights)
kd_weights = self.ce(-kd_weights)
s_probs = None
kd_weights = np.squeeze(kd_weights)
kd_weights = to_variable(kd_weights)
kd_weights = fluid.layers.softmax(-kd_weights)
kd_losses = []
for i in range(len(s_logits)):
j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
t_logit = t_logits[j]
s_logit = s_logits[i]
t_logit.stop_gradient = True
t_probs = fluid.layers.softmax(t_logit / self.T)
s_probs = fluid.layers.softmax(s_logit)
t_probs = fluid.layers.softmax(t_logit) # P_j^T
s_probs = fluid.layers.softmax(s_logit / self.T)  # P_j^S
#kd_loss = -t_probs * fluid.layers.log(s_probs)
kd_loss = fluid.layers.cross_entropy(
input=s_probs, label=t_probs, soft_label=True)
kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1)
kd_loss = fluid.layers.mean(kd_loss)
# print("kd_loss[{}] = {}; kd_weights[{}] = {}".format(i, kd_loss.numpy(), i, kd_weights[i]))
# tmp = kd_loss * kd_weights[i]
tmp = fluid.layers.scale(kd_loss, scale=kd_weights[i])
# print("kd_loss[{}] = {}".format(i, tmp.numpy()))
kd_losses.append(tmp)
kd_loss = fluid.layers.reduce_mean(kd_loss)
kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
kd_losses.append(kd_loss)
kd_loss = fluid.layers.sum(kd_losses)
# print("kd_loss = {}".format(kd_loss.numpy()))
losses = []
for logit in s_logits:
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=logit, label=labels, return_softmax=True)
loss = fluid.layers.mean(x=ce_loss)
losses.append(loss)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
ce_loss = fluid.layers.sum(losses)
ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
logits=s_logits[-1], label=labels, return_softmax=True)
ce_loss = fluid.layers.mean(x=ce_loss)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
# return ce_loss, accuracy, None, None
return loss, accuracy, ce_loss, kd_loss
return total_loss, accuracy, ce_loss, kd_loss, s_logits
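# In equation form, the objective assembled above is
#   w_i  = softmax_i( -L_{j(i)}^T )        (j(i) = teacher layer mapped to student cell i)
#   KD   = sum_i w_i * mean_batch( CE_soft( softmax(z_{j(i)}^T), softmax(z_i^S / T) ) )
#   loss = (1 - gamma) * CE(softmax(z_last^S), labels) + gamma * KD
# where the cell-to-layer mapping j(i) comes from the ceil() ratio computed above.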
......@@ -32,6 +32,7 @@ from .transformer_encoder import EncoderLayer
class BertModelLayer(Layer):
def __init__(self,
num_labels,
emb_size=128,
hidden_size=768,
n_layer=12,
......@@ -91,6 +92,7 @@ class BertModelLayer(Layer):
param_attr=fluid.ParamAttr(name="s_emb_factorization"))
self._encoder = EncoderLayer(
num_labels=num_labels,
n_layer=self._n_layer,
hidden_size=self._hidden_size,
search_layer=self._search_layer,
......@@ -101,6 +103,10 @@ class BertModelLayer(Layer):
return self._src_emb.parameters() + self._pos_emb.parameters(
) + self._sent_emb.parameters()
def emb_names(self):
return self._src_emb.parameters() + self._pos_emb.parameters(
) + self._sent_emb.parameters()
def max_flops(self):
return self._encoder.max_flops
......@@ -110,54 +116,19 @@ class BertModelLayer(Layer):
def arch_parameters(self):
return [self._encoder.alphas] #, self._encoder.k]
def forward(self,
src_ids,
position_ids,
sentence_ids,
flops=[],
model_size=[]):
def forward(self, data_ids, epoch):
"""
forward
"""
ids = np.squeeze(src_ids.numpy())
sids = np.squeeze(sentence_ids.numpy())
batchsize = ids.shape[0]
ids_0 = ids[((sids == 0) & (ids != 0))]
seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1)
y_0 = np.concatenate([np.arange(s) for s in seqlen_0])
x_0 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_0)
])
ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64)
ids0[(x_0, y_0)] = ids_0
ids_1 = ids[(sids == 1) & (ids != 0)]
seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1)
y_1 = np.concatenate([np.arange(s) for s in seqlen_1])
x_1 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_1)
])
ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64)
ids1[(x_1, y_1)] = ids_1
msl = max(seqlen_0.max(), seqlen_1.max())
ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]],
mode='constant')
ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]],
mode='constant')
ids0 = fluid.dygraph.to_variable(ids0)
ids1 = fluid.dygraph.to_variable(ids1)
ids0 = data_ids[5]
ids1 = data_ids[6]
src_emb_0 = self._src_emb(ids0)
src_emb_1 = self._src_emb(ids1)
emb_out_0 = self._emb_fac(src_emb_0)
emb_out_1 = self._emb_fac(src_emb_1)
# (bs, seq_len, 768)
# (bs, seq_len, hidden_size)
enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
enc_outputs = self._encoder(
emb_out, flops=flops, model_size=model_size)
return enc_outputs
......@@ -23,14 +23,15 @@ from collections import Iterable
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, BatchNorm, Pool2D, to_variable
from paddle.fluid.dygraph import to_variable
from paddle.fluid.initializer import NormalInitializer
from paddle.fluid import ParamAttr
from paddle.fluid.initializer import MSRA, ConstantInitializer
ConvBN_PRIMITIVES = [
'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3',
'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none',
'skip_connect'
'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3',
'skip_connect', 'none'
]
......@@ -53,11 +54,6 @@ class MixedOp(fluid.dygraph.Layer):
def __init__(self, n_channel, name=None):
super(MixedOp, self).__init__()
PRIMITIVES = ConvBN_PRIMITIVES
# ops = [
# OPS[primitive](n_channel, name
# if name is None else name + "/" + primitive)
# for primitive in PRIMITIVES
# ]
ops = []
for primitive in PRIMITIVES:
op = OPS[primitive](n_channel, name
......@@ -76,26 +72,17 @@ class MixedOp(fluid.dygraph.Layer):
self._ops = fluid.dygraph.LayerList(ops)
def forward(self, x, weights):
#out = weights[0] * self._ops[0](x)
# out = fluid.layers.sums(
# [weights[i] * op(x) for i, op in enumerate(self._ops)])
# [weights[i] * op(x) for i, op in enumerate(self._ops)])
# return out
for i in range(len(self._ops)):
if isinstance(weights, Iterable):
weights_i = weights[i]
else:
weights_i = weights[i].numpy()
if weights_i != 0:
for i in range(len(weights.numpy())):
if weights[i].numpy() != 0:
return self._ops[i](x) * weights[i]
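# With a hard (one-hot) Gumbel sample only the selected primitive has a non-zero
# weight, so each mixed op executes a single candidate per forward pass.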
def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10):
#U = np.random.uniform(0, 1, logits.shape)
#U = - to_variable(
# np.log(-np.log(U + eps) + eps).astype("float32"))
def gumbel_softmax(logits, epoch, temperature=1.0, hard=True, eps=1e-10):
temperature = temperature * (0.98**epoch)
U = np.random.gumbel(0, 1, logits.shape).astype("float32")
logits = logits + to_variable(U)
......@@ -105,12 +92,12 @@ def gumbel_softmax(logits, temperature=1, hard=True, eps=1e-10):
if hard:
maxes = fluid.layers.reduce_max(logits, dim=1, keep_dim=True)
hard = fluid.layers.cast((logits == maxes), logits.dtype)
# out = hard - logits.detach() + logits
tmp = hard - logits
tmp.stop_gradient = True
out = tmp + logits
out = hard - logits.detach() + logits
# tmp.stop_gradient = True
# out = tmp + logits
else:
out = logits
return out
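# A minimal numpy-only sketch of the straight-through sampling implemented above
# (hard one-hot values in the forward pass, soft softmax gradients in the backward
# pass); the function name and fixed temperature here are illustrative only:
import numpy as np

def gumbel_softmax_sketch(logits, temperature=1.0):
    # perturb the logits with Gumbel(0, 1) noise
    g = np.random.gumbel(0.0, 1.0, logits.shape).astype("float32")
    y = (logits + g) / temperature
    soft = np.exp(y - y.max(axis=-1, keepdims=True))
    soft = soft / soft.sum(axis=-1, keepdims=True)
    # hard one-hot selection; a framework would return hard - stop_grad(soft) + soft
    hard = (soft == soft.max(axis=-1, keepdims=True)).astype("float32")
    return hard, soft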
......@@ -142,8 +129,6 @@ class ReluConvBN(fluid.dygraph.Layer):
use_cudnn=True,
name=None):
super(ReluConvBN, self).__init__()
#conv_std = (2.0 /
# (filter_size[0] * filter_size[1] * out_c * in_c))**0.5
conv_param = fluid.ParamAttr(
name=name if name is None else (name + "_conv.weights"),
initializer=fluid.initializer.MSRA())
......@@ -215,6 +200,7 @@ class EncoderLayer(Layer):
"""
def __init__(self,
num_labels,
n_layer,
hidden_size=768,
name="encoder",
......@@ -224,12 +210,27 @@ class EncoderLayer(Layer):
super(EncoderLayer, self).__init__()
self._n_layer = n_layer
self._hidden_size = hidden_size
self._n_channel = 256
self._n_channel = 128
self._steps = 3
self._n_ops = len(ConvBN_PRIMITIVES)
self.use_fixed_gumbel = use_fixed_gumbel
self.stem = fluid.dygraph.Sequential(
self.stem0 = fluid.dygraph.Sequential(
Conv2D(
num_channels=1,
num_filters=self._n_channel,
filter_size=[3, self._hidden_size],
padding=[1, 0],
param_attr=fluid.ParamAttr(initializer=MSRA()),
bias_attr=False),
BatchNorm(
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1)),
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0))))
self.stem1 = fluid.dygraph.Sequential(
Conv2D(
num_channels=1,
num_filters=self._n_channel,
......@@ -262,16 +263,10 @@ class EncoderLayer(Layer):
default_initializer=NormalInitializer(
loc=0.0, scale=1e-3))
# self.k = fluid.layers.create_parameter(
# shape=[1, self._n_layer],
# dtype="float32",
# default_initializer=NormalInitializer(
# loc=0.0, scale=1e-3))
self.pool2d_avg = Pool2D(pool_type='avg', global_pooling=True)
self.bns = []
self.outs = []
for i in range(self._n_layer):
bn = BatchNorm(
num_channels=self._n_channel,
param_attr=fluid.ParamAttr(
......@@ -280,52 +275,53 @@ class EncoderLayer(Layer):
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=0),
trainable=False))
self.bns.append(bn)
out = Linear(
self._n_channel,
3,
num_labels,
param_attr=ParamAttr(initializer=MSRA()),
bias_attr=ParamAttr(initializer=MSRA()))
self.bns.append(bn)
self.outs.append(out)
self._bns = fluid.dygraph.LayerList(self.bns)
self._outs = fluid.dygraph.LayerList(self.outs)
self.use_fixed_gumbel = use_fixed_gumbel
self.gumbel_alphas = gumbel_softmax(self.alphas)
if gumbel_alphas is not None:
self.gumbel_alphas = np.array(gumbel_alphas).reshape(
self.alphas.shape)
else:
self.gumbel_alphas = gumbel_softmax(self.alphas)
self.gumbel_alphas.stop_gradient = True
#self.gumbel_alphas = gumbel_softmax(self.alphas, 0).detach()
mrpc_arch = [
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 0
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0], # std_conv7 0 # node 1
[0, 0, 0, 0, 1, 0, 0, 0, 0, 0], # dil_conv5 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 0 # node2
[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], # std_conv3 1
[0, 0, 0, 0, 0, 0, 0, 0, 0, 1], # zero 2
[0, 0, 0, 1, 0, 0, 0, 0, 0, 0] # dil_conv3 3
]
self.gumbel_alphas = to_variable(
np.array(mrpc_arch).astype(np.float32))
self.gumbel_alphas.stop_gradient = True
print("gumbel_alphas: \n", self.gumbel_alphas.numpy())
def forward(self, enc_input_0, enc_input_1, epoch, flops=[],
model_size=[]):
alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax(
self.alphas, epoch)
print("gumbel_alphas: {}".format(self.gumbel_alphas))
s0 = fluid.layers.unsqueeze(enc_input_0, [1])
s1 = fluid.layers.unsqueeze(enc_input_1, [1])
s0 = self.stem0(s0)
s1 = self.stem1(s1)
def forward(self, enc_input_0, enc_input_1, flops=[], model_size=[]):
alphas = self.gumbel_alphas if self.use_fixed_gumbel else gumbel_softmax(
self.alphas)
s0 = fluid.layers.reshape(
enc_input_0, [-1, 1, enc_input_0.shape[1], enc_input_0.shape[2]])
s1 = fluid.layers.reshape(
enc_input_1, [-1, 1, enc_input_1.shape[1], enc_input_1.shape[2]])
# (bs, 1, seq_len, hidden_size)
s0 = self.stem(s0)
s1 = self.stem(s1)
# (bs, n_channel, seq_len, 1)
if self.use_fixed_gumbel:
alphas = self.gumbel_alphas
else:
alphas = gumbel_softmax(self.alphas)
s0 = s1 = tmp
outputs = []
enc_outputs = []
for i in range(self._n_layer):
s0, s1 = s1, self._cells[i](s0, s1, alphas)
tmp = self.bns[i](s1)
tmp = self.pool2d_avg(tmp)
# (bs, n_channel, seq_len, 1)
tmp = self._bns[i](s1)
tmp = self.pool2d_avg(tmp)
tmp = fluid.layers.reshape(tmp, shape=[-1, 0])
tmp = self.outs[i](tmp)
outputs.append(tmp)
return outputs
tmp = self._outs[i](tmp)
enc_outputs.append(tmp)
return enc_outputs
......@@ -58,7 +58,8 @@ class BERTClassifier(Layer):
num_labels,
task_name="mnli",
model_path=None,
use_cuda=True):
use_cuda=True,
return_pooled_out=True):
super(BERTClassifier, self).__init__()
self.task_name = task_name.lower()
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12/"
......@@ -84,7 +85,7 @@ class BERTClassifier(Layer):
}
self.cls_model = ClsModelLayer(
self.bert_config, num_labels, return_pooled_out=True)
self.bert_config, num_labels, return_pooled_out=return_pooled_out)
if model_path is not None:
#restore the model
......
......@@ -46,6 +46,7 @@ class ClsModelLayer(Layer):
self.use_fp16 = use_fp16
self.loss_scaling = loss_scaling
self.n_layers = config['num_hidden_layers']
self.return_pooled_out = return_pooled_out
self.bert_layer = BertModelLayer(
config=self.config, return_pooled_out=True, use_fp16=self.use_fp16)
......@@ -79,11 +80,23 @@ class ClsModelLayer(Layer):
enc_outputs, next_sent_feats = self.bert_layer(
src_ids, position_ids, sentence_ids, input_mask)
if not self.return_pooled_out:
cls_feat = fluid.layers.dropout(
x=next_sent_feats[-1],
dropout_prob=0.1,
dropout_implementation="upscale_in_train")
logits = self.cls_fc[-1](cls_feat)
probs = fluid.layers.softmax(logits)
num_seqs = fluid.layers.create_tensor(dtype='int64')
accuracy = fluid.layers.accuracy(
input=probs, label=labels, total=num_seqs)
return enc_outputs, logits, accuracy, num_seqs
logits = []
losses = []
accuracys = []
for next_sent_feat, fc in zip(next_sent_feats, self.cls_fc):
cls_feat = fluid.layers.dropout(
x=next_sent_feat,
dropout_prob=0.1,
......
......@@ -16,6 +16,7 @@ import io
import os
import types
import csv
import random
import numpy as np
from . import tokenization
from .batching import prepare_batch_data
......@@ -139,6 +140,8 @@ class DataProcessor(object):
epoch: int. Total epoches to generate data.
shuffle: bool. Whether to shuffle examples.
"""
search_examples = self.get_train_examples(self.data_dir)
random.shuffle(search_examples)
if phase == 'train':
examples = self.get_train_examples(self.data_dir)
self.num_examples['train'] = len(examples)
......@@ -152,13 +155,13 @@ class DataProcessor(object):
examples = self.get_test_examples(self.data_dir)
self.num_examples['test'] = len(examples)
elif phase == 'search_train':
examples = self.get_train_examples(self.data_dir)
self.num_examples['search_train'] = len(examples) / 2
examples = examples[:self.num_examples['search_train']]
#examples = self.get_train_examples(self.data_dir)
self.num_examples['search_train'] = len(search_examples) / 2
examples = search_examples[:self.num_examples['search_train']]
elif phase == 'search_valid':
examples = self.get_train_examples(self.data_dir)
self.num_examples['search_valid'] = len(examples) / 2
examples = examples[self.num_examples['search_train']:]
#examples = self.get_train_examples(self.data_dir)
self.num_examples['search_valid'] = len(search_examples) / 2
examples = search_examples[self.num_examples['search_valid']:]
else:
raise ValueError(
"Unknown phase, which should be in ['train', 'dev', 'test'].")
......@@ -213,16 +216,53 @@ class DataProcessor(object):
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
batch = self.split_seq_pair(batch)
yield batch
all_dev_batches = []
return wrapper
def split_seq_pair(self, data_ids):
src_ids = data_ids[0]
sentence_ids = data_ids[2]
ids = np.squeeze(src_ids)
sids = np.squeeze(sentence_ids)
batchsize = ids.shape[0]
ids_0 = ids[((sids == 0) & (ids != 0))]
seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1)
y_0 = np.concatenate([np.arange(s) for s in seqlen_0])
x_0 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_0)
])
ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64)
ids0[(x_0, y_0)] = ids_0
ids_1 = ids[(sids == 1) & (ids != 0)]
seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1)
y_1 = np.concatenate([np.arange(s) for s in seqlen_1])
x_1 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_1)
])
ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64)
ids1[(x_1, y_1)] = ids_1
msl = max(seqlen_0.max(), seqlen_1.max())
ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]],
mode='constant')
ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]],
mode='constant')
return data_ids + [ids0, ids1]
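# Worked example for a single row of a batch (values hypothetical): a row
#   ids  = [101, 7, 102, 9, 10, 11, 102, 0]
#   sids = [  0, 0,   0, 1,  1,  1,   1, 0]
# contributes [101, 7, 102] to ids0 (sids==0, ids!=0) and [9, 10, 11, 102] to ids1
# (sids==1); every row is then right-padded with zeros to the longest sentence
# length in the batch, e.g. ids0 -> [101, 7, 102, 0] here.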
class InputExample(object):
"""A single training/test example for simple sequence classification."""
......