cls.py

#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""BERT fine-tuning in Paddle Dygraph Mode."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import six
import sys
if six.PY2:
    reload(sys)
    sys.setdefaultencoding('utf8')
import ast
import time
import argparse
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as fluid
from paddle.fluid.dygraph import to_variable, Layer, Linear
from paddle.fluid.dygraph.base import to_variable
from .reader.cls import *
from .model.bert import BertModelLayer
from .optimization import Optimizer
from .utils.init import init_from_static_model
from paddleslim.teachers.bert import BERTClassifier

__all__ = ["AdaBERTClassifier"]


class AdaBERTClassifier(Layer):
    def __init__(self,
                 num_labels,
                 n_layer=8,
                 emb_size=768,
                 hidden_size=768,
                 gamma=0.8,
                 beta=4,
                 task_name='mnli',
                 conv_type="conv_bn",
                 search_layer=False,
                 teacher_model=None,
                 data_dir=None,
                 use_fixed_gumbel=False,
                 gumbel_alphas=None,
                 fix_emb=False,
                 t=5.0):
        super(AdaBERTClassifier, self).__init__()
        self._n_layer = n_layer
        self._num_labels = num_labels
        self._emb_size = emb_size
        self._hidden_size = hidden_size
        self._gamma = gamma
        self._beta = beta
        self._conv_type = conv_type
        self._search_layer = search_layer
        self._teacher_model = teacher_model
        self._data_dir = data_dir
        self.use_fixed_gumbel = use_fixed_gumbel

        self.T = t
        print(
            "----------------------load teacher model and test----------------------------------------"
        )
        self.teacher = BERTClassifier(
            num_labels, task_name=task_name, model_path=self._teacher_model)
        # global setting, will be overwritten when training(about 1% acc loss)
        self.teacher.eval()
        self.teacher.test(self._data_dir)
        print(
            "----------------------finish load teacher model and test----------------------------------------"
        )
        self.student = BertModelLayer(
            num_labels=num_labels,
            n_layer=self._n_layer,
            emb_size=self._emb_size,
            hidden_size=self._hidden_size,
            conv_type=self._conv_type,
            search_layer=self._search_layer,
            use_fixed_gumbel=self.use_fixed_gumbel,
            gumbel_alphas=gumbel_alphas)

        fix_emb = False
        for s_emb, t_emb in zip(self.student.emb_names(),
                                self.teacher.emb_names()):
            t_emb.stop_gradient = True
            if fix_emb:
                s_emb.stop_gradient = True
            print(
                "Assigning embedding[{}] from teacher to embedding[{}] in student.".
                format(t_emb.name, s_emb.name))
            fluid.layers.assign(input=t_emb, output=s_emb)
            print(
                "Assigned embedding[{}] from teacher to embedding[{}] in student.".
                format(t_emb.name, s_emb.name))

    def forward(self, data_ids, epoch):
        return self.student(data_ids, epoch)

    def arch_parameters(self):
        return self.student.arch_parameters()

    def loss(self, data_ids, epoch):
        labels = data_ids[4]

        s_logits = self.student(data_ids, epoch)

        t_enc_outputs, t_logits, t_losses, t_accs, _ = self.teacher(data_ids)

        #define kd loss
        kd_weights = []
        for i in range(len(s_logits)):
            j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
            kd_weights.append(t_losses[j].numpy())

        kd_weights = np.array(kd_weights)
        kd_weights = np.squeeze(kd_weights)
        kd_weights = to_variable(kd_weights)
        kd_weights = fluid.layers.softmax(-kd_weights)

        kd_losses = []
        for i in range(len(s_logits)):
            j = int(np.ceil(i * (float(len(t_logits)) / len(s_logits))))
            t_logit = t_logits[j]
            s_logit = s_logits[i]
            t_logit.stop_gradient = True
            t_probs = fluid.layers.softmax(t_logit)  # P_j^T
            s_probs = fluid.layers.softmax(s_logit / self.T)  #P_j^S
            #kd_loss = -t_probs * fluid.layers.log(s_probs)
            kd_loss = fluid.layers.cross_entropy(
                input=s_probs, label=t_probs, soft_label=True)
            kd_loss = fluid.layers.reduce_mean(kd_loss)
            kd_loss = fluid.layers.scale(kd_loss, scale=kd_weights[i])
            kd_losses.append(kd_loss)
        kd_loss = fluid.layers.sum(kd_losses)

        losses = []
        for logit in s_logits:
            ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
                logits=logit, label=labels, return_softmax=True)
            loss = fluid.layers.mean(x=ce_loss)
            losses.append(loss)

            num_seqs = fluid.layers.create_tensor(dtype='int64')
            accuracy = fluid.layers.accuracy(
                input=probs, label=labels, total=num_seqs)
        ce_loss = fluid.layers.sum(losses)

        total_loss = (1 - self._gamma) * ce_loss + self._gamma * kd_loss

        return total_loss, accuracy, ce_loss, kd_loss, s_logits