Commit 9240f31c authored by baiyfbupt

update basekd

Parent 4233d6a8
@@ -10,26 +10,34 @@ import pickle
 import logging
 from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.nas.darts import count_parameters_in_MB

 logger = get_logger(__name__, level=logging.INFO)


 def valid_one_epoch(model, valid_loader, epoch, log_freq):
     accs = AvgrageMeter()
     ce_losses = AvgrageMeter()
-    model.student.eval()
+    t_accs = AvgrageMeter()
+    model.eval()

     step_id = 0
     for valid_data in valid_loader():
         try:
-            loss, acc, ce_loss, _, _ = model._layers.loss(valid_data, epoch)
+            loss, acc, ce_loss, _, _, t_acc = model._layers.loss(valid_data,
+                                                                 epoch)
         except:
-            loss, acc, ce_loss, _, _ = model.loss(valid_data, epoch)
+            loss, acc, ce_loss, _, _, t_acc = model.loss(valid_data, epoch)

         batch_size = valid_data[0].shape[0]
         ce_losses.update(ce_loss.numpy(), batch_size)
         accs.update(acc.numpy(), batch_size)
+        t_accs.update(t_acc.numpy(), batch_size)
         step_id += 1
-    return ce_losses.avg[0], accs.avg[0]
+    return ce_losses.avg[0], accs.avg[0], t_accs.avg[0]


 def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
@@ -38,18 +46,19 @@ def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
     accs = AvgrageMeter()
     ce_losses = AvgrageMeter()
     kd_losses = AvgrageMeter()
-    model.student.train()
+    t_accs = AvgrageMeter()
+    model.train()

     step_id = 0
     for train_data in train_loader():
         batch_size = train_data[0].shape[0]
         if use_data_parallel:
-            total_loss, acc, ce_loss, kd_loss, _ = model._layers.loss(
-                train_data, epoch)
+            total_loss, acc, ce_loss, kd_loss, _, t_acc = model._layers.loss(
+                train_data, epoch)
         else:
-            total_loss, acc, ce_loss, kd_loss, _ = model.loss(train_data,
-                                                              epoch)
+            total_loss, acc, ce_loss, kd_loss, _, t_acc = model.loss(
+                train_data, epoch)

         if use_data_parallel:
             total_loss = model.scale_loss(total_loss)
@@ -63,19 +72,23 @@ def train_one_epoch(model, train_loader, optimizer, epoch, use_data_parallel,
         accs.update(acc.numpy(), batch_size)
         ce_losses.update(ce_loss.numpy(), batch_size)
         kd_losses.update(kd_loss.numpy(), batch_size)
+        t_accs.update(t_acc.numpy(), batch_size)

         if step_id % log_freq == 0:
             logger.info(
-                "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f};".
+                "Train Epoch {}, Step {}, Lr {:.6f} total_loss {:.6f}; ce_loss {:.6f}, kd_loss {:.6f}, train_acc {:.6f}, teacher_acc {:.6f};".
                 format(epoch, step_id,
-                       optimizer.current_step_lr(), total_losses.avg[0],
-                       ce_losses.avg[0], kd_losses.avg[0], accs.avg[0]))
+                       optimizer.current_step_lr(), total_losses.avg[
+                           0], ce_losses.avg[0], kd_losses.avg[0], accs.avg[0],
+                       t_accs.avg[0]))
         step_id += 1
+    return total_losses.avg[0], accs.avg[0]


 def main():
     # whether use multi-gpus
-    use_data_parallel = False
+    device_num = fluid.dygraph.parallel.Env().nranks
+    use_data_parallel = device_num > 1

     place = fluid.CUDAPlace(fluid.dygraph.parallel.Env(
     ).dev_id) if use_data_parallel else fluid.CUDAPlace(0)
@@ -88,12 +101,12 @@ def main():
     max_seq_len = 128
     batch_size = 192
-    hidden_size = 768
+    hidden_size = 128
     emb_size = 768
     epoch = 80
-    log_freq = 10
+    log_freq = 1

-    task_name = 'mnli'
+    task_name = 'mrpc'
     if task_name == 'mrpc':
         data_dir = "./data/glue_data/MRPC/"
@@ -110,7 +123,6 @@ def main():
         num_labels = 3
         processor_func = MnliProcessor

-    device_num = fluid.dygraph.parallel.Env().nranks
     use_fixed_gumbel = True
     train_phase = "train"
     val_phase = "dev"
@@ -129,7 +141,11 @@ def main():
             emb_size=emb_size,
             teacher_model=teacher_model_dir,
             data_dir=data_dir,
-            use_fixed_gumbel=use_fixed_gumbel)
+            use_fixed_gumbel=use_fixed_gumbel,
+            t=1.0)
+
+        logger.info("param size = {:.6f}MB".format(
+            count_parameters_in_MB(model.student.parameters())))

         learning_rate = fluid.dygraph.CosineDecay(2e-2, step_per_epoch, epoch)
@@ -174,7 +190,8 @@ def main():
             capacity=128,
             use_double_buffer=True,
             iterable=True,
-            return_list=True)
+            return_list=True,
+            use_multiprocess=True)
         dev_loader = fluid.io.DataLoader.from_generator(
             capacity=128,
             use_double_buffer=True,
@@ -190,14 +207,18 @@ def main():
         best_valid_acc = 0
         for epoch_id in range(epoch):
-            train_one_epoch(model, train_loader, optimizer, epoch_id,
-                            use_data_parallel, log_freq)
-            loss, acc = valid_one_epoch(model, dev_loader, epoch_id, log_freq)
+            total_loss, train_acc = train_one_epoch(
+                model, train_loader, optimizer, epoch_id, use_data_parallel,
+                log_freq)
+            logger.info("train set, total_loss {:.6f}; acc {:.6f};".format(
+                total_loss, train_acc))
+            loss, acc, t_acc = valid_one_epoch(model, dev_loader, epoch_id,
+                                               log_freq)
             if acc > best_valid_acc:
                 best_valid_acc = acc
             logger.info(
-                "dev set, ce_loss {:.6f}; acc {:.6f}, best_acc {:.6f};".format(
-                    loss, acc, best_valid_acc))
+                "dev set, ce_loss {:.6f}; teacher_acc: {:.6f}, acc {:.6f}, best_acc {:.6f};".
+                format(loss, t_acc, acc, best_valid_acc))


 if __name__ == '__main__':
@@ -57,7 +57,7 @@ class AdaBERTClassifier(Layer):
                  use_fixed_gumbel=False,
                  gumbel_alphas=None,
                  fix_emb=False,
-                 t=5.0):
+                 t=1.0):
         super(AdaBERTClassifier, self).__init__()
         self._n_layer = n_layer
         self._num_labels = num_labels
@@ -78,8 +78,9 @@ class AdaBERTClassifier(Layer):
         self.teacher = BERTClassifier(
             num_labels, task_name=task_name, model_path=self._teacher_model)
         # global setting, will be overwritten when training(about 1% acc loss)
-        self.teacher.eval()
         self.teacher.test(self._data_dir)
+        self.teacher.eval()
         print(
             "----------------------finish load teacher model and test----------------------------------------"
         )
@@ -116,49 +117,67 @@ class AdaBERTClassifier(Layer):
     def loss(self, data_ids, epoch):
         labels = data_ids[4]
-        s_logits = self.student(data_ids, epoch)
+        s_logits, s_fea = self.student(data_ids, epoch)
+
+        # make sure techer is compute in eval mode
+        self.teacher.eval()
+        t_total_loss, t_logits, t_losses, t_accs, _, t_fea = self.teacher(
+            data_ids)
+        if self.student.training:
+            self.student.train()
+
+        t_logits[-1].stop_gradient = True
+        #kd_loss = fluid.layers.mse_loss(s_logits[-1], t_logits[-1])
+        #kd_loss = fluid.layers.mse_loss(s_fea, t_fea)
+        #kd_loss = fluid.layers.reduce_sum(fluid.layers.square(s_logits[-1] - t_logits[-1]))
+        t_probs = fluid.layers.softmax(t_logits[-1] / self.T)
+        s_probs = fluid.layers.softmax(s_logits[-1] / self.T)
+        kd_loss = fluid.layers.reduce_mean(
+            fluid.layers.cross_entropy(
                input=s_probs, label=t_probs, soft_label=True))

-        t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)
-
         #define kd loss
-        kd_weights = []
-        for i in range(len(s_logits)):
-            j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
-            kd_weights.append(t_losses[j].numpy())
-        kd_weights = np.array(kd_weights)
-        kd_weights = np.squeeze(kd_weights)
-        kd_weights = to_variable(kd_weights)
-        kd_weights = fluid.layers.softmax(-kd_weights)
-        kd_losses = []
-        for i in range(len(s_logits)):
-            j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
-            t_logit = t_logits[j]
-            s_logit = s_logits[i]
-            t_logit.stop_gradient = True
-            t_probs = fluid.layers.softmax(t_logit)  # P_j^T
-            s_probs = fluid.layers.softmax(s_logit / self.T)  #P_j^S
-            #kd_loss = -t_probs * fluid.layers.log(s_probs)
-            kd_loss = fluid.layers.cross_entropy(
-                input=s_probs, label=t_probs, soft_label=True)
-            kd_loss = fluid.layers.reduce_mean(kd_loss)
-            kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
-            kd_losses.append(kd_loss)
-        kd_loss = fluid.layers.sum(kd_losses)
+        # kd_weights = []
+        # for i in range(len(s_logits)):
+        #     j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
+        #     kd_weights.append(t_losses[j].numpy())
+        # kd_weights = np.array(kd_weights)
+        # kd_weights = np.squeeze(kd_weights)
+        # kd_weights = to_variable(kd_weights)
+        # kd_weights = fluid.layers.softmax(-kd_weights)
+        # kd_losses = []
+        # for i in range(len(s_logits)):
+        #     j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
+        #     t_logit = t_logits[j]
+        #     s_logit = s_logits[i]
+        #     t_logit.stop_gradient = True
+        #     t_probs = fluid.layers.softmax(t_logit)  # P_j^T
+        #     s_probs = fluid.layers.softmax(s_logit / self.T)  #P_j^S
+        #     #kd_loss = -t_probs * fluid.layers.log(s_probs)
+        #     kd_loss = fluid.layers.cross_entropy(
+        #         input=s_probs, label=t_probs, soft_label=True)
+        #     kd_loss = fluid.layers.reduce_mean(kd_loss)
+        #     kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
+        #     kd_losses.append(kd_loss)
+        # kd_loss = fluid.layers.sum(kd_losses)

         losses = []
-        for logit in s_logits:
+        for logit in [s_logits[-1]]:
             ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
                 logits=logit, label=labels, return_softmax=True)
+            #print("training: ", self.student.training, probs.numpy())
             loss = fluid.layers.mean(x=ce_loss)
             losses.append(loss)

-        num_seqs = fluid.layers.create_tensor(dtype='int64')
-        accuracy = fluid.layers.accuracy(
-            input=probs, label=labels, total=num_seqs)
+        accuracy = fluid.layers.accuracy(input=probs, label=labels)

         ce_loss = fluid.layers.sum(losses)
-        total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
+        #total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss
+        total_loss = kd_loss

-        return total_loss, accuracy, ce_loss, kd_loss, s_logits
+        return total_loss, accuracy, ce_loss, kd_loss, s_logits, t_accs[-1]
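
The kd_loss introduced above is the standard temperature-scaled soft-label distillation objective: the teacher's and student's last logits are both softened with the same temperature self.T, and the soft cross entropy between the two distributions is averaged over the batch (total_loss is currently set to kd_loss alone). A minimal NumPy sketch of the same computation, for reference only; the names below are illustrative and not part of the repo:

import numpy as np

def soft_label_kd_loss(s_logits, t_logits, T=1.0):
    # softmax with the row max subtracted for numerical stability
    def softmax(x):
        e = np.exp(x - x.max(axis=-1, keepdims=True))
        return e / e.sum(axis=-1, keepdims=True)

    t_probs = softmax(t_logits / T)  # teacher soft targets (treated as constants)
    s_probs = softmax(s_logits / T)  # student soft predictions
    # soft-label cross entropy: -sum_k t_k * log(s_k) per sample, then batch mean
    per_sample = -(t_probs * np.log(s_probs + 1e-12)).sum(axis=-1)
    return per_sample.mean()

# toy usage: 2 samples, 3 classes
print(soft_label_kd_loss(np.array([[2.0, 0.5, -1.0], [0.1, 0.2, 0.3]]),
                         np.array([[1.5, 0.2, -0.5], [0.0, 0.5, 0.1]])))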
@@ -91,6 +91,11 @@ class BertModelLayer(Layer):
             output_dim=self._hidden_size,
             param_attr=fluid.ParamAttr(name="s_emb_factorization"))

+        self._emb_fac_1 = Linear(
+            input_dim=self._emb_size,
+            output_dim=self._hidden_size,
+            param_attr=fluid.ParamAttr(name="s_emb_factorization_1"))
+
         self._encoder = EncoderLayer(
             num_labels=num_labels,
             n_layer=self._n_layer,
@@ -103,10 +108,6 @@ class BertModelLayer(Layer):
         return self._src_emb.parameters() + self._pos_emb.parameters(
         ) + self._sent_emb.parameters()

-    def emb_names(self):
-        return self._src_emb.parameters() + self._pos_emb.parameters(
-        ) + self._sent_emb.parameters()
-
     def max_flops(self):
         return self._encoder.max_flops
@@ -129,6 +130,6 @@ class BertModelLayer(Layer):
         emb_out_1 = self._emb_fac(src_emb_1)
         # (bs, seq_len, hidden_size)

-        enc_outputs = self._encoder(emb_out_0, emb_out_1, epoch)
-        return enc_outputs
+        enc_outputs, fea = self._encoder(emb_out_0, emb_out_1, epoch)
+        return enc_outputs, fea
@@ -45,8 +45,8 @@ OPS = {
     'avg_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='avg'),
     'max_pool_3': lambda n_channel, name: Pool2D(pool_size=(3,1), pool_padding=(1, 0), pool_type='max'),
-    'none': lambda n_channel, name: Zero(),
     'skip_connect': lambda n_channel, name: Identity(),
+    'none': lambda n_channel, name: Zero(),
 }
@@ -61,10 +61,10 @@ class MixedOp(fluid.dygraph.Layer):
             if 'pool' in primitive:
                 gama = ParamAttr(
                     initializer=fluid.initializer.Constant(value=1),
-                    trainable=False)
+                    trainable=True)
                 beta = ParamAttr(
                     initializer=fluid.initializer.Constant(value=0),
-                    trainable=False)
+                    trainable=True)
                 BN = BatchNorm(n_channel, param_attr=gama, bias_attr=beta)
                 op = fluid.dygraph.Sequential(op, BN)
             ops.append(op)
@@ -125,7 +125,7 @@ class ReluConvBN(fluid.dygraph.Layer):
                  filter_size=[3, 1],
                  dilation=1,
                  stride=1,
-                 affine=False,
+                 affine=True,
                  use_cudnn=True,
                  name=None):
         super(ReluConvBN, self).__init__()
@@ -210,40 +210,40 @@ class EncoderLayer(Layer):
         super(EncoderLayer, self).__init__()
         self._n_layer = n_layer
         self._hidden_size = hidden_size
-        self._n_channel = 128
+        self._n_channel = hidden_size
         self._steps = 3
         self._n_ops = len(ConvBN_PRIMITIVES)
         self.use_fixed_gumbel = use_fixed_gumbel

-        self.stem0 = fluid.dygraph.Sequential(
-            Conv2D(
-                num_channels=1,
-                num_filters=self._n_channel,
-                filter_size=[3, self._hidden_size],
-                padding=[1, 0],
-                param_attr=fluid.ParamAttr(initializer=MSRA()),
-                bias_attr=False),
-            BatchNorm(
-                num_channels=self._n_channel,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=1)),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0))))
-
-        self.stem1 = fluid.dygraph.Sequential(
-            Conv2D(
-                num_channels=1,
-                num_filters=self._n_channel,
-                filter_size=[3, self._hidden_size],
-                padding=[1, 0],
-                param_attr=fluid.ParamAttr(initializer=MSRA()),
-                bias_attr=False),
-            BatchNorm(
-                num_channels=self._n_channel,
-                param_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=1)),
-                bias_attr=fluid.ParamAttr(
-                    initializer=fluid.initializer.Constant(value=0))))
+        # self.stem0 = fluid.dygraph.Sequential(
+        #     Conv2D(
+        #         num_channels=1,
+        #         num_filters=self._n_channel,
+        #         filter_size=[3, self._hidden_size],
+        #         padding=[1, 0],
+        #         param_attr=fluid.ParamAttr(initializer=MSRA()),
+        #         bias_attr=False),
+        #     BatchNorm(
+        #         num_channels=self._n_channel,
+        #         param_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=1)),
+        #         bias_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=0))))
+
+        # self.stem1 = fluid.dygraph.Sequential(
+        #     Conv2D(
+        #         num_channels=1,
+        #         num_filters=self._n_channel,
+        #         filter_size=[3, self._hidden_size],
+        #         padding=[1, 0],
+        #         param_attr=fluid.ParamAttr(initializer=MSRA()),
+        #         bias_attr=False),
+        #     BatchNorm(
+        #         num_channels=self._n_channel,
+        #         param_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=1)),
+        #         bias_attr=fluid.ParamAttr(
+        #             initializer=fluid.initializer.Constant(value=0))))

         cells = []
         for i in range(n_layer):
@@ -271,10 +271,10 @@ class EncoderLayer(Layer):
                 num_channels=self._n_channel,
                 param_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.Constant(value=1),
-                    trainable=False),
+                    trainable=True),
                 bias_attr=fluid.ParamAttr(
                     initializer=fluid.initializer.Constant(value=0),
-                    trainable=False))
+                    trainable=True))
             out = Linear(
                 self._n_channel,
                 num_labels,
@@ -311,17 +311,28 @@ class EncoderLayer(Layer):
         s0 = fluid.layers.unsqueeze(enc_input_0, [1])
         s1 = fluid.layers.unsqueeze(enc_input_1, [1])

-        s0 = self.stem0(s0)
-        s1 = self.stem1(s1)
+        s0 = fluid.layers.transpose(s0, [0, 3, 2, 1])
+        s1 = fluid.layers.transpose(s1, [0, 3, 2, 1])
+        # s0 = self.stem0(s0)
+        # s1 = self.stem1(s1)

         enc_outputs = []
+        fea = []
         for i in range(self._n_layer):
             s0, s1 = s1, self._cells[i](s0, s1, alphas)
             # (bs, n_channel, seq_len, 1)
             tmp = self._bns[i](s1)
+            tmp = s1
             tmp = self.pool2d_avg(tmp)
             tmp = fluid.layers.reshape(tmp, shape=[-1, 0])
+            fea.append(tmp)
+            tmp = fluid.layers.dropout(
+                x=tmp,
+                dropout_prob=0.1,
+                dropout_implementation="upscale_in_train")
             tmp = self._outs[i](tmp)
             enc_outputs.append(tmp)

-        return enc_outputs
+        return enc_outputs, fea[-1]
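
The transpose pair that replaces stem0/stem1 above only rearranges axes: after unsqueeze the input is (bs, 1, seq_len, hidden_size), and transposing with perm [0, 3, 2, 1] puts the hidden dimension on the channel axis, which matches the new `_n_channel = hidden_size` setting so the cells consume the embeddings directly. A shape-only sketch under those assumptions (illustrative, not repo code):

import numpy as np

bs, seq_len, hidden_size = 4, 128, 128
emb = np.zeros((bs, seq_len, hidden_size))       # encoder input, (bs, seq_len, hidden)
x = emb[:, None, :, :]                           # unsqueeze axis 1 -> (bs, 1, seq_len, hidden)
x = np.transpose(x, (0, 3, 2, 1))                # perm [0, 3, 2, 1] -> (bs, hidden, seq_len, 1)
assert x.shape == (bs, hidden_size, seq_len, 1)  # channel count now equals hidden_size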
@@ -120,12 +120,16 @@ class BERTClassifier(Layer):
         test_data_generator = processor.data_generator(
             batch_size=batch_size, phase='dev', epoch=1, shuffle=False)

+        # test train mode test_acc
         self.cls_model.eval()
+        print("test with test mode:...")
+
         total_cost, final_acc, avg_acc, total_num_seqs = [], [], [], []
         for batch in test_data_generator():
             data_ids = create_data(batch)
-            total_loss, _, _, np_acces, np_num_seqs = self.cls_model(data_ids)
+            total_loss, _, _, np_acces, np_num_seqs, fea = self.cls_model(
+                data_ids)
             np_loss = total_loss.numpy()
             np_acc = np_acces[-1].numpy()
@@ -118,4 +118,5 @@ class ClsModelLayer(Layer):
             accuracys.append(accuracy)

         total_loss = fluid.layers.sum(losses)
-        return total_loss, logits, losses, accuracys, num_seqs
+        return total_loss, logits, losses, accuracys, num_seqs, next_sent_feat[
+            -1]
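
Returning `next_sent_feat[-1]` exposes the teacher's pooled sentence feature, which is what the commented-out `fluid.layers.mse_loss(s_fea, t_fea)` line in AdaBERTClassifier.loss would consume for feature-level distillation. A minimal sketch of that variant, assuming the student and teacher features share the same shape (illustrative only, not part of this commit):

import numpy as np

def feature_kd_loss(s_fea, t_fea):
    # mean squared error between the student feature and the teacher feature,
    # with the teacher feature treated as a constant target (no gradient)
    s_fea = np.asarray(s_fea)
    t_fea = np.asarray(t_fea)
    return ((s_fea - t_fea) ** 2).mean()

# toy usage: batch of 2, feature size 4 -> loss 1.0
print(feature_kd_loss(np.ones((2, 4)), np.zeros((2, 4))))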