Commit 3ff0613a authored by baiyfbupt

refine reader

Parent cc17334b
......@@ -3,6 +3,10 @@ from itertools import izip
import paddle.fluid as fluid
from paddleslim.teachers.bert.reader.cls import *
from paddleslim.nas.darts.search_space import AdaBERTClassifier
from paddle.fluid.dygraph.base import to_variable
from tqdm import tqdm
import os
import pickle
import logging
from paddleslim.common import AvgrageMeter, get_logger
......@@ -17,6 +21,35 @@ def count_parameters_in_MB(all_params):
return parameters_number / 1e6
def preprocess_data(data_generator, data_nums, phase, cached_data):
t = tqdm(total=data_nums)
data_list = []
for data in tqdm(data_generator()):
# data_var = []
# for d in data:
# tmp = fluid.core.LoDTensor()
# tmp.set(d, fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id))
# data_var.append(tmp)
data_list.append(data)
t.update(data[0].shape[0])
t.close()
logger.info("Saving {} data to {}".format(phase, cached_data + phase))
f = open(cached_data + phase, 'wb')
pickle.dump(data_list, f)
f.close()
return data_list
def generator_reader(data_list):
def wrapper():
for d in data_list:
yield d
return wrapper
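A minimal sketch of how these two helpers are meant to be combined: preprocess once, pickle the batches to disk, and replay them through a plain generator on later runs. The load_or_preprocess wrapper and the commented wiring are illustrative only; the actual logic lives in main() below.
import os
import pickle

def load_or_preprocess(reader, data_nums, phase, cached_data):
    path = cached_data + phase
    if os.path.exists(path):
        # replay previously pickled batches instead of re-tokenizing
        with open(path, 'rb') as f:
            return pickle.load(f)
    # fall back to the preprocess_data helper defined above
    return preprocess_data(reader, data_nums, phase, cached_data)

# data_list = load_or_preprocess(train_reader, 392702, "train", cached_data)
# train_loader.set_batch_generator(generator_reader(data_list), places=place)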
def train_one_epoch(model, train_loader, valid_loader, optimizer,
arch_optimizer, epoch, use_data_parallel, log_freq):
total_losses = AvgrageMeter()
......@@ -31,6 +64,8 @@ def train_one_epoch(model, train_loader, valid_loader, optimizer,
#for train_data in train_loader():
batch_size = train_data[0].shape[0]
# make sure the sampled arch is the same on every GPU
np.random.seed(step_id * 2)
try:
total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
train_data, epoch)
......@@ -51,6 +86,8 @@ def train_one_epoch(model, train_loader, valid_loader, optimizer,
ce_losses.update(ce_loss.numpy(), batch_size)
kd_losses.update(kd_loss.numpy(), batch_size)
# make sure the sampled arch is the same on every GPU
np.random.seed(step_id * 2 + 1)
try:
arch_loss, _, _, _, arch_logits = model._layers.loss(valid_data,
epoch)
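The np.random.seed calls above exist so that, under data parallelism, every GPU draws the same Gumbel noise for a given step and therefore samples the same architecture. A small standalone sketch of that effect, assuming the noise is drawn through numpy's global RNG (an assumption about gumbel_softmax, whose body is not shown in this hunk):
import numpy as np

def gumbel_noise(shape, eps=1e-10):
    # standard Gumbel(0, 1) noise from uniform samples
    u = np.random.uniform(0, 1, shape)
    return -np.log(-np.log(u + eps) + eps)

np.random.seed(7 * 2)            # "GPU 0" at step 7
noise_a = gumbel_noise([8])
np.random.seed(7 * 2)            # "GPU 1" at the same step, same seed
noise_b = gumbel_noise([8])
assert np.allclose(noise_a, noise_b)   # identical noise -> identical sampled arch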
......@@ -95,29 +132,27 @@ def valid_one_epoch(model, valid_loader, epoch, log_freq):
ce_losses.update(ce_loss.numpy(), batch_size)
accs.update(acc.numpy(), batch_size)
# if step_id % log_freq == 0:
# logger.info("Valid Epoch {}, Step {}, ce_loss {:.6f}; acc: {:.6f};".
# format(epoch, step_id, ce_losses.avg[0], accs.avg[0]))
step_id += 1
return ce_losses.avg[0], accs.avg[0]
def main():
# whether use multi-gpus
use_data_parallel = True
use_data_parallel = False
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
BERT_BASE_PATH = "./data/pretrained_models/uncased_L-12_H-768_A-12"
bert_config_path = BERT_BASE_PATH + "/bert_config.json"
vocab_path = BERT_BASE_PATH + "/vocab.txt"
data_dir = "./data/glue_data/MNLI/"
cached_data = "./data/glue_data/MNLI/cached_data_"
teacher_model_dir = "./data/teacher_model/steps_23000"
do_lower_case = True
#num_samples = 392702
num_samples = 8016987
max_seq_len = 128
batch_size = 64
# any change to vocab/do_lower_case/max_seq_len requires regenerating the cached data
batch_size = 128
hidden_size = 768
emb_size = 768
max_layer = 8
......@@ -157,13 +192,11 @@ def main():
[a.name for a in model.teacher.parameters()]):
model_parameters.append(p)
#clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)
optimizer = fluid.optimizer.MomentumOptimizer(
learning_rate,
0.9,
regularization=fluid.regularizer.L2DecayRegularizer(3e-4),
parameter_list=model_parameters)
# grad_clip=clip)
arch_optimizer = fluid.optimizer.Adam(
3e-4,
......@@ -172,29 +205,69 @@ def main():
regularization=fluid.regularizer.L2Decay(1e-3),
parameter_list=model.arch_parameters())
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase=train_phase,
epoch=1,
dev_count=1,
shuffle=True)
valid_reader = processor.data_generator(
batch_size=batch_size,
phase=val_phase,
epoch=1,
dev_count=1,
shuffle=True)
print("train_data nums:", processor.get_num_examples(train_phase))
print("valid_data nums:", processor.get_num_examples(val_phase))
print("dev_data nums:", processor.get_num_examples("dev"))
if os.path.exists(cached_data + "train") and os.path.exists(
cached_data + "valid") + os.path.exists(cached_data + "dev"):
f = open(cached_data + "train", 'rb')
logger.info("loading preprocessed train data from {}".format(
cached_data + "train"))
train_data_list = pickle.load(f)
f.close()
f = open(cached_data + "valid", 'rb')
logger.info("loading preprocessed valid data from {}".format(
cached_data + "valid"))
valid_data_list = pickle.load(f)
f.close()
f = open(cached_data + "dev", 'rb')
logger.info("loading preprocessed dev data from {}".format(
cached_data + "dev"))
dev_data_list = pickle.load(f)
f.close()
else:
processor = MnliProcessor(
data_dir=data_dir,
vocab_path=vocab_path,
max_seq_len=max_seq_len,
do_lower_case=do_lower_case,
in_tokens=False)
train_reader = processor.data_generator(
batch_size=batch_size,
phase=train_phase,
epoch=1,
dev_count=1,
shuffle=True)
valid_reader = processor.data_generator(
batch_size=batch_size,
phase=val_phase,
epoch=1,
dev_count=1,
shuffle=True)
dev_reader = processor.data_generator(
batch_size=batch_size,
phase="dev",
epoch=1,
dev_count=1,
shuffle=False)
train_data_nums = processor.get_num_examples(train_phase)
valid_data_nums = processor.get_num_examples(val_phase)
dev_data_nums = processor.get_num_examples("dev")
logger.info("Preprocessing train data")
train_data_list = preprocess_data(train_reader, train_data_nums,
"train", cached_data)
logger.info("Preprocessing valid data")
valid_data_list = preprocess_data(valid_reader, valid_data_nums,
"valid", cached_data)
logger.info("Preprocessing dev data")
dev_data_list = preprocess_data(dev_reader, dev_data_nums, "dev",
cached_data)
train_reader = generator_reader(train_data_list)
valid_reader = generator_reader(valid_data_list)
dev_reader = generator_reader(dev_data_list)
if use_data_parallel:
train_reader = fluid.contrib.reader.distributed_batch_reader(
......@@ -202,25 +275,18 @@ def main():
valid_reader = fluid.contrib.reader.distributed_batch_reader(
valid_reader)
dev_reader = processor.data_generator(
batch_size=batch_size,
phase="dev",
epoch=1,
dev_count=1,
shuffle=False)
train_loader = fluid.io.DataLoader.from_generator(
capacity=512,
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
valid_loader = fluid.io.DataLoader.from_generator(
capacity=512,
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
dev_loader = fluid.io.DataLoader.from_generator(
capacity=512,
capacity=128,
use_double_buffer=True,
iterable=True,
return_list=True)
......
......@@ -114,13 +114,13 @@ class AdaBERTClassifier(Layer):
return logits
def loss(self, data_ids, epoch):
src_ids = data_ids[0]
position_ids = data_ids[1]
sentence_ids = data_ids[2]
input_mask = data_ids[3]
# src_ids = data_ids[0]
# position_ids = data_ids[1]
# sentence_ids = data_ids[2]
# input_mask = data_ids[3]
labels = data_ids[4]
s_logits = self.student(src_ids, position_ids, sentence_ids, epoch)
s_logits = self.student(data_ids, epoch)
t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)
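For reference, the batch layout loss() assumes after this change: the first five entries are the original reader fields (see the commented lines above), and the last two are appended by split_seq_pair in the reader, shown further down in this commit.
# data_ids[0]  src_ids        packed sentence-pair token ids
# data_ids[1]  position_ids
# data_ids[2]  sentence_ids   0 for sentence A, 1 for sentence B
# data_ids[3]  input_mask
# data_ids[4]  labels
# data_ids[5]  ids0           sentence A token ids, right-padded (from split_seq_pair)
# data_ids[6]  ids1           sentence B token ids, right-padded (from split_seq_pair)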
......
......@@ -108,48 +108,12 @@ class BertModelLayer(Layer):
def arch_parameters(self):
return [self._encoder.alphas] #, self._encoder.k]
def forward(self,
src_ids,
position_ids,
sentence_ids,
epoch,
flops=[],
model_size=[]):
def forward(self, data_ids, epoch):
"""
forward
"""
ids = np.squeeze(src_ids.numpy())
sids = np.squeeze(sentence_ids.numpy())
batchsize = ids.shape[0]
ids_0 = ids[((sids == 0) & (ids != 0))]
seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1)
y_0 = np.concatenate([np.arange(s) for s in seqlen_0])
x_0 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_0)
])
ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64)
ids0[(x_0, y_0)] = ids_0
ids_1 = ids[(sids == 1) & (ids != 0)]
seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1)
y_1 = np.concatenate([np.arange(s) for s in seqlen_1])
x_1 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_1)
])
ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64)
ids1[(x_1, y_1)] = ids_1
msl = max(seqlen_0.max(), seqlen_1.max())
ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]],
mode='constant')
ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]],
mode='constant')
ids0 = fluid.dygraph.to_variable(ids0)
ids1 = fluid.dygraph.to_variable(ids1)
ids0 = data_ids[5]
ids1 = data_ids[6]
src_emb_0 = self._src_emb(ids0)
src_emb_1 = self._src_emb(ids1)
......@@ -157,7 +121,6 @@ class BertModelLayer(Layer):
emb_out_1 = self._emb_fac(src_emb_1)
# (bs, seq_len, hidden_size)
enc_outputs = self._encoder(
emb_out_0, emb_out_1, epoch, flops=flops, model_size=model_size)
enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
return enc_outputs
......@@ -70,14 +70,13 @@ class MixedOp(fluid.dygraph.Layer):
self._ops = fluid.dygraph.LayerList(ops)
def forward(self, x, weights, index):
out = fluid.layers.sums(
[weights[i] * op(x) for i, op in enumerate(self._ops)])
return out
# out = fluid.layers.sums(
# [weights[i] * op(x) for i, op in enumerate(self._ops)])
# return out
# caused a bug on multi-GPU runs
#for i in range(len(self._ops)):
# if weights[i].numpy() != 0:
# return self._ops[i](x) * weights[i]
for i in range(len(self._ops)):
if weights[i].numpy() != 0:
return self._ops[i](x) * weights[i]
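Since gumbel_softmax below is used with hard one-hot samples, the weighted sum that is now commented out and the loop that replaces it produce the same result; the loop simply avoids evaluating the unselected ops. A toy sketch of that equivalence (the lambdas stand in for the real candidate ops):
import numpy as np

weights = np.array([0., 1., 0.], dtype='float32')            # hard one-hot sample
ops = [lambda x: x + 1, lambda x: x * 2, lambda x: x - 1]    # toy stand-ins
x = np.array([3.0])

weighted_sum = sum(w * op(x) for w, op in zip(weights, ops))        # evaluates every op
picked = next(op(x) * w for op, w in zip(ops, weights) if w != 0)   # evaluates one op
assert np.allclose(weighted_sum, picked)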
def gumbel_softmax(logits, epoch, temperature=1.0, hard=True, eps=1e-10):
......
......@@ -209,16 +209,53 @@ class DataProcessor(object):
return_input_mask=True,
return_max_len=False,
return_num_token=False)
if len(all_dev_batches) < dev_count:
all_dev_batches.append(batch_data)
if len(all_dev_batches) == dev_count:
for batch in all_dev_batches:
batch = self.split_seq_pair(batch)
yield batch
all_dev_batches = []
return wrapper
def split_seq_pair(self, data_ids):
src_ids = data_ids[0]
sentence_ids = data_ids[2]
ids = np.squeeze(src_ids)
sids = np.squeeze(sentence_ids)
batchsize = ids.shape[0]
ids_0 = ids[((sids == 0) & (ids != 0))]
seqlen_0 = ((sids == 0) & (ids != 0)).astype(np.int64).sum(1)
y_0 = np.concatenate([np.arange(s) for s in seqlen_0])
x_0 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_0)
])
ids0 = np.zeros([batchsize, seqlen_0.max()], dtype=np.int64)
ids0[(x_0, y_0)] = ids_0
ids_1 = ids[(sids == 1) & (ids != 0)]
seqlen_1 = ((sids == 1) & (ids != 0)).astype(np.int64).sum(1)
y_1 = np.concatenate([np.arange(s) for s in seqlen_1])
x_1 = np.concatenate([
np.ones(
[s], dtype=np.int64) * i for i, s in enumerate(seqlen_1)
])
ids1 = np.zeros([batchsize, seqlen_1.max()], dtype=np.int64)
ids1[(x_1, y_1)] = ids_1
msl = max(seqlen_0.max(), seqlen_1.max())
ids0 = np.pad(ids0, [[0, 0], [0, msl - seqlen_0.max()]],
mode='constant')
ids1 = np.pad(ids1, [[0, 0], [0, msl - seqlen_1.max()]],
mode='constant')
return data_ids + [ids0, ids1]
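A self-contained sketch of the trick split_seq_pair relies on: the packed [CLS] A [SEP] B [SEP] input is masked on sentence_ids (0 for sentence A, 1 for sentence B), and the surviving tokens are scattered by (row, position) into two right-padded matrices:
import numpy as np

ids  = np.array([[101, 7, 8, 102, 21, 22, 23, 102, 0, 0]])  # token ids, 0 = padding
sids = np.array([[0,   0, 0, 0,   1,  1,  1,  1,   0, 0]])  # sentence ids

def split(ids, sids, segment):
    mask = (sids == segment) & (ids != 0)
    seqlen = mask.astype(np.int64).sum(1)                 # kept tokens per row
    x = np.concatenate([np.full(s, i, dtype=np.int64) for i, s in enumerate(seqlen)])
    y = np.concatenate([np.arange(s) for s in seqlen])
    out = np.zeros([ids.shape[0], seqlen.max()], dtype=np.int64)
    out[(x, y)] = ids[mask]                               # scatter into a padded matrix
    return out

print(split(ids, sids, 0))   # [[101   7   8 102]]
print(split(ids, sids, 1))   # [[ 21  22  23 102]]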
class InputExample(object):
"""A single training/test example for simple sequence classification."""
......