From d3e51bfcce92893d8171226b1e375b0e5b8346c1 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Mon, 20 Apr 2020 06:25:48 +0000 Subject: [PATCH] 1. Fix reader 2. Change gate conv to conv_bn_relu --- paddleslim/nas/darts/architect.py | 4 +- .../nas/darts/search_space/conv_bert/cls.py | 52 +++- .../search_space/conv_bert/model/bert.py | 28 +- .../conv_bert/model/transformer_encoder.py | 254 +++++++++++++----- paddleslim/nas/darts/train_search.py | 17 +- paddleslim/teachers/bert/reader/cls.py | 12 +- 6 files changed, 272 insertions(+), 95 deletions(-) diff --git a/paddleslim/nas/darts/architect.py b/paddleslim/nas/darts/architect.py index ed6889b8..4a0eed12 100644 --- a/paddleslim/nas/darts/architect.py +++ b/paddleslim/nas/darts/architect.py @@ -60,8 +60,8 @@ class Architect(object): def _backward_step(self, valid_data): loss = self.model.loss(valid_data) - loss.backward() - return loss + loss[0].backward() + return loss[0] def _backward_step_unrolled(self, train_data, valid_data): self._compute_unrolled_model(train_data) diff --git a/paddleslim/nas/darts/search_space/conv_bert/cls.py b/paddleslim/nas/darts/search_space/conv_bert/cls.py index b84ca0d1..e3540f31 100755 --- a/paddleslim/nas/darts/search_space/conv_bert/cls.py +++ b/paddleslim/nas/darts/search_space/conv_bert/cls.py @@ -41,27 +41,44 @@ __all__ = ["AdaBERTClassifier"] class AdaBERTClassifier(Layer): - def __init__(self, num_labels, n_layer=8, emb_size=768, + def __init__(self, + num_labels, + n_layer=8, + emb_size=128, + hidden_size=768, + gamma=0.8, + beta=4, + conv_type="conv_bn", + search_layer=True, teacher_model=None): super(AdaBERTClassifier, self).__init__() self._n_layer = n_layer self._num_labels = num_labels self._emb_size = emb_size + self._hidden_size = hidden_size + self._gamma = gamma + self._beta = beta + self._conv_type = conv_type + self._search_layer = search_layer print( "----------------------load teacher model and test----------------------------------------" ) self.teacher = BERTClassifier(num_labels, model_path=teacher_model) - # self.teacher.test("/work/PaddleSlim/demo/bert/data/glue_data/MNLI/") + self.teacher.test("/work/PaddleSlim/demo/bert/data/glue_data/MNLI/") print( "----------------------finish load teacher model and test----------------------------------------" ) self.student = BertModelLayer( - n_layer=self._n_layer, emb_size=self._emb_size) + n_layer=self._n_layer, + emb_size=self._emb_size, + hidden_size=self._hidden_size, + conv_type=self._conv_type, + search_layer=self._search_layer) self.cls_fc = list() for i in range(self._n_layer): fc = Linear( - input_dim=self._emb_size, + input_dim=self._hidden_size, output_dim=self._num_labels, param_attr=fluid.ParamAttr( name="s_cls_out_%d_w" % i, @@ -84,7 +101,14 @@ class AdaBERTClassifier(Layer): def genotype(self): return self.arch_parameters() - def loss(self, data_ids, beta=4, gamma=0.8): + def new(self): + model_new = AdaBERTClassifier( + 3, + teacher_model="/work/PaddleSlim/demo/bert_1/checkpoints/steps_23000" + ) + return model_new + + def loss(self, data_ids): T = 1.0 src_ids = data_ids[0] position_ids = data_ids[1] @@ -130,7 +154,7 @@ class AdaBERTClassifier(Layer): t_probs = fluid.layers.softmax(t_logit) s_probs = fluid.layers.softmax(s_logits) - t_probs.stop_gradient = False + t_probs.stop_gradient = True kd_loss = t_probs * fluid.layers.log(s_probs / T) kd_loss = fluid.layers.reduce_sum(kd_loss, dim=1) kd_loss = kd_loss * kd_weights[i] @@ -144,14 +168,16 @@ class AdaBERTClassifier(Layer): ce_loss = fluid.layers.reduce_mean(ce_loss) * k_i 
# define e loss - model_size = fluid.layers.sum( - model_size) / self.student.max_model_size() + model_size = fluid.layers.sum(model_size) + # print("model_size: {}".format(model_size.numpy()/1e6)) + model_size = model_size / self.student.max_model_size() flops = fluid.layers.sum(flops) / self.student.max_flops() e_loss = (len(next_sent_feats) * k_i / self._n_layer) * ( flops + model_size) # define total loss - loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss - print("ce_loss: {}; kd_loss: {}; e_loss: {}".format(( - 1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(), beta * - e_loss.numpy())) - return loss + loss = (1 - self._gamma + ) * ce_loss - self._gamma * kd_loss + self._beta * e_loss + # print("ce_loss: {}; kd_loss: {}; e_loss: {}".format(( + # 1 - gamma) * ce_loss.numpy(), -gamma * kd_loss.numpy(), beta * + # e_loss.numpy())) + return loss, ce_loss, kd_loss, e_loss diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py index 1df7984b..ce9adbb7 100644 --- a/paddleslim/nas/darts/search_space/conv_bert/model/bert.py +++ b/paddleslim/nas/darts/search_space/conv_bert/model/bert.py @@ -28,17 +28,21 @@ from .transformer_encoder import EncoderLayer class BertModelLayer(Layer): def __init__(self, - emb_size=768, + emb_size=128, + hidden_size=768, n_layer=12, voc_size=30522, max_position_seq_len=512, sent_types=2, return_pooled_out=True, initializer_range=1.0, + conv_type="conv_bn", + search_layer=True, use_fp16=False): super(BertModelLayer, self).__init__() self._emb_size = emb_size + self._hidden_size = hidden_size self._n_layer = n_layer self._voc_size = voc_size self._max_position_seq_len = max_position_seq_len @@ -50,6 +54,8 @@ class BertModelLayer(Layer): self._sent_emb_name = "s_sent_embedding" self._dtype = "float16" if use_fp16 else "float32" + self._conv_type = conv_type + self._search_layer = search_layer self._param_initializer = fluid.initializer.TruncatedNormal( scale=initializer_range) @@ -71,16 +77,24 @@ class BertModelLayer(Layer): name=self._sent_emb_name, initializer=self._param_initializer), dtype=self._dtype) - self.pooled_fc = Linear( + self._emb_fac = Linear( input_dim=self._emb_size, - output_dim=self._emb_size, + output_dim=self._hidden_size, + param_attr=fluid.ParamAttr(name="s_emb_factorization")) + + self.pooled_fc = Linear( + input_dim=self._hidden_size, + output_dim=self._hidden_size, param_attr=fluid.ParamAttr( name="s_pooled_fc.w_0", initializer=self._param_initializer), bias_attr="s_pooled_fc.b_0", act="tanh") self._encoder = EncoderLayer( - n_layer=self._n_layer, d_model=self._emb_size) + n_layer=self._n_layer, + hidden_size=self._hidden_size, + conv_type=self._conv_type, + search_layer=self._search_layer) def max_flops(self): return self._encoder.max_flops @@ -89,7 +103,7 @@ class BertModelLayer(Layer): return self._encoder.max_model_size def arch_parameters(self): - return [self._encoder.alphas] + return [self._encoder.alphas, self._encoder.k] def forward(self, src_ids, @@ -107,6 +121,8 @@ class BertModelLayer(Layer): emb_out = src_emb + pos_emb emb_out = emb_out + sent_emb + emb_out = self._emb_fac(emb_out) + enc_outputs, k_i = self._encoder( emb_out, flops=flops, model_size=model_size) @@ -118,7 +134,7 @@ class BertModelLayer(Layer): input=enc_output, axes=[1], starts=[0], ends=[1]) next_sent_feat = self.pooled_fc(next_sent_feat) next_sent_feat = fluid.layers.reshape( - next_sent_feat, shape=[-1, self._emb_size]) + next_sent_feat, shape=[-1, self._hidden_size]) 
next_sent_feats.append(next_sent_feat) return enc_outputs, next_sent_feats, k_i diff --git a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py index bdbac1c1..895917a3 100644 --- a/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py +++ b/paddleslim/nas/darts/search_space/conv_bert/model/transformer_encoder.py @@ -24,57 +24,92 @@ import paddle.fluid as fluid from paddle.fluid.dygraph import Embedding, LayerNorm, Linear, Layer, Conv2D, BatchNorm, Pool2D, to_variable from paddle.fluid.initializer import NormalInitializer -PRIMITIVES = [ - 'std_conv_3', 'std_conv_5', 'std_conv_7', 'dil_conv_3', 'dil_conv_5', - 'dil_conv_7', 'avg_pool_3', 'max_pool_3', 'none', 'skip_connect' +GConv_PRIMITIVES = [ + 'std_gconv_3', 'std_gconv_5', 'std_gconv_7', 'dil_gconv_3', 'dil_gconv_5', + 'dil_gconv_7', 'avg_pool_3', 'max_pool_3', 'none', 'skip_connect' ] -input_size = 128 * 768 +ConvBN_PRIMITIVES = [ + 'std_conv_bn_3', 'std_conv_bn_5', 'std_conv_bn_7', 'dil_conv_bn_3', + 'dil_conv_bn_5', 'dil_conv_bn_7', 'avg_pool_3', 'max_pool_3', 'none', + 'skip_connect' +] + +channel = 768 +input_size = 128 * 1 FLOPs = { - 'std_conv_3': input_size * 3 * 1, - 'std_conv_5': input_size * 5 * 1, - 'std_conv_7': input_size * 7 * 1, - 'dil_conv_3': input_size * 3 * 1, - 'dil_conv_5': input_size * 5 * 1, - 'dil_conv_7': input_size * 7 * 1, - 'avg_pool_3': input_size * 3 * 1, - 'max_pool_3': input_size * 3 * 1, + 'std_conv_bn_3': input_size * (channel**2) * 3, + 'std_conv_bn_5': input_size * (channel**2) * 5, + 'std_conv_bn_7': input_size * (channel**2) * 7, + 'dil_conv_bn_3': input_size * (channel**2) * 3, + 'dil_conv_bn_5': input_size * (channel**2) * 5, + 'dil_conv_bn_7': input_size * (channel**2) * 7, + 'std_gconv_3': input_size * (channel**2) * 3, + 'std_gconv_5': input_size * (channel**2) * 5, + 'std_gconv_7': input_size * (channel**2) * 7, + 'dil_gconv_3': input_size * (channel**2) * 3, + 'dil_gconv_5': input_size * (channel**2) * 5, + 'dil_gconv_7': input_size * (channel**2) * 7, + 'avg_pool_3': input_size * channel * 3 * 1, + 'max_pool_3': input_size * channel * 3 * 1, 'none': 0, 'skip_connect': 0, } ModelSize = { - 'std_conv_3': 3 * 1, - 'std_conv_5': 5 * 1, - 'std_conv_7': 7 * 1, - 'dil_conv_3': 3 * 1, - 'dil_conv_5': 5 * 1, - 'dil_conv_7': 7 * 1, + 'std_conv_bn_3': (channel**2) * 3 * 1, + 'std_conv_bn_5': (channel**2) * 5 * 1, + 'std_conv_bn_7': (channel**2) * 7 * 1, + 'dil_conv_bn_3': (channel**2) * 3 * 1, + 'dil_conv_bn_5': (channel**2) * 5 * 1, + 'dil_conv_bn_7': (channel**2) * 7 * 1, + 'std_gconv_3': (channel**2) * 3 * 1, + 'std_gconv_5': (channel**2) * 5 * 1, + 'std_gconv_7': (channel**2) * 7 * 1, + 'dil_gconv_3': (channel**2) * 3 * 1, + 'dil_gconv_5': (channel**2) * 5 * 1, + 'dil_gconv_7': (channel**2) * 7 * 1, 'avg_pool_3': 0, 'max_pool_3': 0, 'none': 0, 'skip_connect': 0, } + OPS = { - 'std_conv_3': lambda : ConvBN(1, 1, filter_size=3, dilation=1), - 'std_conv_5': lambda : ConvBN(1, 1, filter_size=5, dilation=1), - 'std_conv_7': lambda : ConvBN(1, 1, filter_size=7, dilation=1), - 'dil_conv_3': lambda : ConvBN(1, 1, filter_size=3, dilation=2), - 'dil_conv_5': lambda : ConvBN(1, 1, filter_size=5, dilation=2), - 'dil_conv_7': lambda : ConvBN(1, 1, filter_size=7, dilation=2), - 'avg_pool_3': lambda : Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='avg'), - 'max_pool_3': lambda : Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='max'), - 'none': lambda : Zero(), - 'skip_connect': 
lambda : Identity(), + 'std_gconv_3': lambda n_channel, name: GateConv(n_channel, n_channel, filter_size=[3, 1], dilation=1, name=name), + 'std_gconv_5': lambda n_channel, name: GateConv(n_channel, n_channel, filter_size=[5, 1], dilation=1, name=name), + 'std_gconv_7': lambda n_channel, name: GateConv(n_channel, n_channel, filter_size=[7, 1], dilation=1, name=name), + 'dil_gconv_3': lambda n_channel, name: GateConv(n_channel, n_channel, filter_size=[3, 1], dilation=2, name=name), + 'dil_gconv_5': lambda n_channel, name: GateConv(n_channel, n_channel, filter_size=[5, 1], dilation=2, name=name), + 'dil_gconv_7': lambda n_channel, name: GateConv(n_channel, n_channel, filter_size=[7, 1], dilation=2, name=name), + 'std_conv_bn_3': lambda n_channel, name: ConvBNRelu(n_channel, n_channel, filter_size=[3, 1], dilation=1, name=name), + 'std_conv_bn_5': lambda n_channel, name: ConvBNRelu(n_channel, n_channel, filter_size=[5, 1], dilation=1, name=name), + 'std_conv_bn_7': lambda n_channel, name: ConvBNRelu(n_channel, n_channel, filter_size=[7, 1], dilation=1, name=name), + 'dil_conv_bn_3': lambda n_channel, name: ConvBNRelu(n_channel, n_channel, filter_size=[3, 1], dilation=2, name=name), + 'dil_conv_bn_5': lambda n_channel, name: ConvBNRelu(n_channel, n_channel, filter_size=[5, 1], dilation=2, name=name), + 'dil_conv_bn_7': lambda n_channel, name: ConvBNRelu(n_channel, n_channel, filter_size=[7, 1], dilation=2, name=name), + + 'avg_pool_3': lambda n_channel, name: Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='avg'), + 'max_pool_3': lambda n_channel, name: Pool2D(pool_size=(3, 1), pool_padding=(1, 0), pool_type='max'), + 'none': lambda n_channel, name: Zero(), + 'skip_connect': lambda n_channel, name: Identity(), } class MixedOp(fluid.dygraph.Layer): - def __init__(self): + def __init__(self, n_channel, name=None, conv_type="conv_bn"): super(MixedOp, self).__init__() - ops = [OPS[primitive]() for primitive in PRIMITIVES] + if conv_type == "conv_bn": + PRIMITIVES = ConvBN_PRIMITIVES + elif conv_type == "gconv": + PRIMITIVES = GConv_PRIMITIVES + ops = [ + OPS[primitive](n_channel, name + if name is None else name + "/" + primitive) + for primitive in PRIMITIVES + ] self._ops = fluid.dygraph.LayerList(ops) self.max_flops = max([FLOPs[primitive] for primitive in PRIMITIVES]) self.max_model_size = max( @@ -121,39 +156,88 @@ def gumbel_softmax(logits, temperature=0.1, hard=True, eps=1e-20): return out -class ConvBN(fluid.dygraph.Layer): +class ConvBNRelu(fluid.dygraph.Layer): def __init__(self, - out_ch, - in_ch, - filter_size=3, + in_c=768, + out_c=768, + filter_size=[3, 1], dilation=1, - act="relu", is_test=False, - use_cudnn=True): - super(ConvBN, self).__init__() - conv_std = (2.0 / (filter_size**2 * in_ch))**0.5 + use_cudnn=True, + name=None): + super(ConvBNRelu, self).__init__() + conv_std = (2.0 / + (filter_size[0] * filter_size[1] * out_c * in_c))**0.5 conv_param = fluid.ParamAttr( + name=name if name is None else (name + "_conv.weights"), initializer=fluid.initializer.Normal(0.0, conv_std)) - self.conv_layer = Conv2D( - in_ch, - out_ch, [filter_size, 1], + self.conv = Conv2D( + in_c, + out_c, + filter_size, dilation=[dilation, 1], - padding=[(filter_size - 1) * dilation // 2, 0], + padding=[(filter_size[0] - 1) * dilation // 2, 0], param_attr=conv_param, - bias_attr=False, act=None, + bias_attr=False, use_cudnn=use_cudnn) - self.bn_layer = BatchNorm(out_ch, act=act, is_test=is_test) + self.bn = BatchNorm(out_c, act="relu", is_test=False) def forward(self, inputs): - conv = 
self.conv_layer(inputs) - bn = self.bn_layer(conv) + conv = self.conv(inputs) + bn = self.bn(conv) return bn +class GateConv(fluid.dygraph.Layer): + def __init__(self, + in_c=768, + out_c=768, + filter_size=[3, 1], + dilation=1, + is_test=False, + use_cudnn=True, + name=None): + super(GateConv, self).__init__() + conv_std = (2.0 / + (filter_size[0] * filter_size[1] * out_c * in_c))**0.5 + conv_param = fluid.ParamAttr( + name=name if name is None else (name + "_conv.weights"), + initializer=fluid.initializer.Normal(0.0, conv_std)) + + gate_param = fluid.ParamAttr( + name=name if name is None else (name + "_conv_gate.weights"), + initializer=fluid.initializer.Normal(0.0, conv_std)) + + self.conv = Conv2D( + in_c, + out_c, + filter_size, + dilation=[dilation, 1], + padding=[(filter_size[0] - 1) * dilation // 2, 0], + param_attr=conv_param, + act=None, + use_cudnn=use_cudnn) + + self.gate = Conv2D( + in_c, + out_c, + filter_size, + dilation=[dilation, 1], + padding=[(filter_size[0] - 1) * dilation // 2, 0], + param_attr=gate_param, + act="sigmoid", + use_cudnn=use_cudnn) + + def forward(self, inputs): + conv = self.conv(inputs) + gate = self.gate(inputs) + return conv * gate + + class Cell(fluid.dygraph.Layer): - def __init__(self, steps): + def __init__(self, steps, n_channel, name=None, conv_type="conv_bn"): super(Cell, self).__init__() self._steps = steps @@ -162,7 +246,11 @@ class Cell(fluid.dygraph.Layer): ops = [] for i in range(self._steps): for j in range(2 + i): - op = MixedOp() + op = MixedOp( + n_channel, + name=name + if name is None else "%s/step%d_edge%d" % (name, i, j), + conv_type=conv_type) self.max_flops += op.max_flops self.max_model_size += op.max_model_size ops.append(op) @@ -191,19 +279,49 @@ class EncoderLayer(Layer): encoder """ - def __init__(self, n_layer, d_model=128, name=""): - + def __init__(self, + n_layer, + hidden_size=768, + name="encoder", + conv_type="conv_bn", + search_layer=True): super(EncoderLayer, self).__init__() cells = [] self._n_layer = n_layer - self._d_model = d_model + self._hidden_size = hidden_size self._steps = 3 + self._search_layer = search_layer self.max_flops = 0 self.max_model_size = 0 + if conv_type == "conv_bn": + self._n_ops = len(ConvBN_PRIMITIVES) + self.conv0 = ConvBNRelu( + in_c=1, + out_c=self._hidden_size, + filter_size=[3, self._hidden_size], + dilation=1, + is_test=False, + use_cudnn=True, + name="conv0") + + elif conv_type == "gconv": + self._n_ops = len(GConv_PRIMITIVES) + self.conv0 = GateConv( + in_c=1, + out_c=self._hidden_size, + filter_size=[3, self._hidden_size], + dilation=1, + is_test=False, + use_cudnn=True, + name="conv0") cells = [] for i in range(n_layer): - cell = Cell(steps=self._steps) + cell = Cell( + steps=self._steps, + n_channel=self._hidden_size, + name="%s/layer_%d" % (name, i), + conv_type=conv_type) cells.append(cell) self.max_flops += cell.max_flops self.max_model_size += cell.max_model_size @@ -211,7 +329,7 @@ class EncoderLayer(Layer): self._cells = fluid.dygraph.LayerList(cells) k = sum(1 for i in range(self._steps) for n in range(2 + i)) - num_ops = len(PRIMITIVES) + num_ops = self._n_ops self.alphas = fluid.layers.create_parameter( shape=[k, num_ops], dtype="float32", @@ -225,14 +343,11 @@ class EncoderLayer(Layer): loc=0.0, scale=1e-3)) def forward(self, enc_input, flops=[], model_size=[]): - """ - forward - :param enc_input: - :param attn_bias: - :return: - """ - tmp = fluid.layers.reshape(enc_input, - [-1, 1, enc_input.shape[1], self._d_model]) + tmp = fluid.layers.reshape( + enc_input, [-1, 1, 
enc_input.shape[1], + self._hidden_size]) #(bs, 1, seq_len, hidden_size) + + tmp = self.conv0(tmp) # (bs, hidden_size, seq_len, 1) alphas = gumbel_softmax(self.alphas) k = fluid.layers.reshape(gumbel_softmax(self.k), [-1]) @@ -240,15 +355,16 @@ class EncoderLayer(Layer): outputs = [] s0 = s1 = tmp for i in range(self._n_layer): - s0, s1 = s1, self._cells[i](s0, - s1, - alphas, - flops=flops, - model_size=model_size) + s0, s1 = s1, self._cells[i]( + s0, s1, alphas, flops=flops, + model_size=model_size) # (bs, hidden_size, seq_len, 1) + enc_output = fluid.layers.transpose( + s1, [0, 2, 1, 3]) # (bs, seq_len, hidden_size, 1) enc_output = fluid.layers.reshape( - s1, [-1, enc_input.shape[1], self._d_model]) + enc_output, [-1, enc_output.shape[1], + self._hidden_size]) # (bs, seq_len, hidden_size) outputs.append(enc_output) - if k[i].numpy() != 0: + if self._search_layer and k[i].numpy() != 0: outputs[-1] = outputs[-1] * k[i] return outputs, k[i] - return None + return outputs, 1.0 diff --git a/paddleslim/nas/darts/train_search.py b/paddleslim/nas/darts/train_search.py index f8049952..cf832f8b 100644 --- a/paddleslim/nas/darts/train_search.py +++ b/paddleslim/nas/darts/train_search.py @@ -74,6 +74,9 @@ class DARTSearch(object): def train_one_epoch(self, train_loader, valid_loader, architect, optimizer, epoch): objs = AvgrageMeter() + ce_losses = AvgrageMeter() + kd_losses = AvgrageMeter() + e_losses = AvgrageMeter() self.model.train() step_id = 0 @@ -81,7 +84,7 @@ class DARTSearch(object): if epoch >= self.epochs_no_archopt: architect.step(train_data, valid_data) - loss = self.model.loss(train_data) + loss, ce_loss, kd_loss, e_loss = self.model.loss(train_data) if self.use_data_parallel: loss = self.model.scale_loss(loss) @@ -96,10 +99,18 @@ class DARTSearch(object): batch_size = train_data[0].shape[0] objs.update(loss.numpy(), batch_size) + ce_losses.update(ce_loss.numpy(), batch_size) + kd_losses.update(kd_loss.numpy(), batch_size) + e_losses.update(e_loss.numpy(), batch_size) if step_id % self.log_freq == 0: - logger.info("Train Epoch {}, Step {}, loss {:.6f}".format( - epoch, step_id, objs.avg[0])) + #logger.info("Train Epoch {}, Step {}, loss {:.6f}; ce: {:.6f}; kd: {:.6f}; e: {:.6f}".format( + # epoch, step_id, objs.avg[0], ce_losses.avg[0], kd_losses.avg[0], e_losses.avg[0])) + logger.info( + "Train Epoch {}, Step {}, loss {}; ce: {}; kd: {}; e: {}". 
+                format(epoch, step_id,
+                       loss.numpy(),
+                       ce_loss.numpy(), kd_loss.numpy(), e_loss.numpy()))
             step_id += 1
         return objs.avg[0]
diff --git a/paddleslim/teachers/bert/reader/cls.py b/paddleslim/teachers/bert/reader/cls.py
index 60bd5505..e05f02a3 100644
--- a/paddleslim/teachers/bert/reader/cls.py
+++ b/paddleslim/teachers/bert/reader/cls.py
@@ -144,6 +144,14 @@ class DataProcessor(object):
         elif phase == 'test':
             examples = self.get_test_examples(self.data_dir)
             self.num_examples['test'] = len(examples)
+        elif phase == 'search_train':
+            examples = self.get_train_examples(self.data_dir)
+            self.num_examples['search_train'] = len(examples) // 2
+            examples = examples[:self.num_examples['search_train']]
+        elif phase == 'search_valid':
+            examples = self.get_train_examples(self.data_dir)
+            self.num_examples['search_valid'] = len(examples) // 2
+            examples = examples[len(examples) // 2:]
         else:
             raise ValueError(
                 "Unknown phase, which should be in ['train', 'dev', 'test'].")
@@ -154,10 +162,10 @@ class DataProcessor(object):
         if shuffle_seed is not None:
             np.random.seed(shuffle_seed)
             np.random.shuffle(examples)
-        if phase == 'train':
+        if phase == 'train' or phase == 'search_train':
             self.current_train_epoch = epoch_index
         for (index, example) in enumerate(examples):
-            if phase == 'train':
+            if phase == 'train' or phase == "search_train":
                 self.current_train_example = index + 1
                 feature = self.convert_example(
                     index, example,
--
GitLab
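
Note on the new loss combination (illustration, not part of the patch): cls.py now
returns (loss, ce_loss, kd_loss, e_loss), where the total is
loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss, with the new defaults
gamma=0.8 and beta=4. A minimal sketch with invented scalar values shows how those
defaults weight the three terms; only the gamma/beta values come from the patch.

    # Illustrative arithmetic only; the ce/kd/e values are invented,
    # gamma and beta are the defaults introduced by this patch.
    gamma, beta = 0.8, 4
    ce_loss, kd_loss, e_loss = 1.2, -0.5, 0.03   # hypothetical scalars
    loss = (1 - gamma) * ce_loss - gamma * kd_loss + beta * e_loss
    print(loss)  # 0.2 * 1.2 + 0.8 * 0.5 + 4 * 0.03 = 0.76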
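
Note on the op-cost bookkeeping (illustration, not part of the patch): the FLOPs and
ModelSize tables in transformer_encoder.py assign a cost to every candidate op, and
MixedOp relies on the gumbel_softmax helper to keep the op choice differentiable. The
sketch below uses a soft-only NumPy re-implementation of that helper and a small
subset of the op table; it is a simplified stand-in under those assumptions, not the
patched code.

    import numpy as np

    channel, input_size = 768, 128 * 1           # same constants as the patch
    FLOPS = {'std_conv_bn_3': input_size * channel**2 * 3,
             'std_conv_bn_5': input_size * channel**2 * 5,
             'avg_pool_3':    input_size * channel * 3,
             'skip_connect':  0}

    def gumbel_softmax(logits, temperature=0.1, eps=1e-20):
        # Soft-only sketch of the helper used by EncoderLayer (no hard sampling).
        u = np.random.uniform(size=logits.shape)
        g = -np.log(-np.log(u + eps) + eps)      # Gumbel(0, 1) noise
        x = (logits + g) / temperature
        x = x - x.max()                          # numerical stability
        e = np.exp(x)
        return e / e.sum()

    alphas = np.zeros(len(FLOPS))                # untrained architecture logits
    weights = gumbel_softmax(alphas)
    expected_flops = sum(w * f for w, f in zip(weights, FLOPS.values()))
    print(weights, expected_flops / max(FLOPS.values()))  # normalized, as in e_loss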
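
Note on the reader fix (illustration, not part of the patch): the new search_train /
search_valid phases split the original training set in half so the architecture search
uses disjoint data for weight updates and architecture updates. A tiny sketch of that
split, using integer division so the slice index stays an int under Python 3 (the
example list is made up):

    # Hypothetical examples; mirrors the halving done in DataProcessor.data_generator.
    examples = ["example_%d" % i for i in range(10)]
    half = len(examples) // 2                    # integer division, not '/'
    search_train = examples[:half]
    search_valid = examples[half:]
    assert len(search_train) + len(search_valid) == len(examples)
    print(len(search_train), len(search_valid))  # 5 5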