distill.py

#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import os 

import numpy as np
from sklearn.metrics import f1_score
import paddle as P
import paddle.fluid as F
import paddle.fluid.layers as L
import paddle.fluid.dygraph as D
import propeller.paddle  as propeller

from ernie.tokenizing_ernie import ErnieTokenizer
from ernie.modeling_ernie import ErnieModelForSequenceClassification
from ernie.optimization import AdamW, LinearDecay


# 本例子采用chnsenticorp中文情感识别任务作为示范；并且事先通过数据增强扩充了蒸馏所需的无监督数据
# 
# 下载数据；并存放在 ./chnsenticorp-data/
# 数据分为3列：原文；空格切词；情感标签
# 其中第一列为ERNIE的输入；第二列为BoW词袋模型的输入
# 事先统计好的BoW 词典在 ./chnsenticorp-data/vocab.bow.txt

# 定义finetune teacher模型所需要的超参数
DATA_DIR='./chnsenticorp-data/'
SEQLEN=256
BATCH=32
EPOCH=10
LR=5e-5

tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

student_vocab = {i.strip(): l for l, i in enumerate(open(os.path.join(DATA_DIR, 'vocab.bow.txt')).readlines())}

def space_tokenizer(i):
    return i.decode('utf8').split()

feature_column = propeller.data.FeatureColumns([
    propeller.data.TextColumn('seg_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer.tokenize),
    propeller.data.TextColumn('seg_a_student', unk_id=student_vocab['[UNK]'], vocab_dict=student_vocab, tokenizer=space_tokenizer),
    propeller.data.LabelColumn('label', vocab_dict={
        b"0": 0,
        b"1": 1,
    }),
])

def map_fn(seg_a, seg_a_student, label):
    seg_a, _ = tokenizer.truncate(seg_a, [], seqlen=SEQLEN)
    sentence, segments = tokenizer.build_for_ernie(seg_a)
    return seg_a_student, sentence, segments, label


train_ds = feature_column.build_dataset('train', data_dir=os.path.join(DATA_DIR, 'train/'), shuffle=True, repeat=False, use_gz=False) \
    .map(map_fn) \
    .padded_batch(BATCH)

train_ds_unlabel = feature_column.build_dataset('train-da', data_dir=os.path.join(DATA_DIR, 'train-data-augmented/'), shuffle=True, repeat=False, use_gz=False) \
    .map(map_fn) \
    .padded_batch(BATCH)

dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(DATA_DIR, 'dev/'), shuffle=False, repeat=False, use_gz=False) \
    .map(map_fn) \
    .padded_batch(BATCH,) 

shapes = ([-1,SEQLEN],[-1,SEQLEN], [-1, SEQLEN], [-1])
types = ('int64', 'int64', 'int64', 'int64')

train_ds.data_shapes = shapes
train_ds.data_types = types
train_ds_unlabel.data_shapes = shapes
train_ds_unlabel.data_types = types
dev_ds.data_shapes = shapes
dev_ds.data_types = types

place = F.CUDAPlace(0)
D.guard(place).__enter__()

def evaluate_teacher(model, dataset):
    all_pred, all_label = [], []
    with D.base._switch_tracer_mode_guard_(is_train=False):
        model.eval()
        for step, (ids_student, ids, _, labels) in enumerate(dataset.start()):
            _, logits = model(ids)
            pred = L.argmax(logits, -1)
            all_pred.extend(pred.numpy())
            all_label.extend(labels.numpy())
        f1 = f1_score(all_label, all_pred, average='macro')
        model.train()
        return f1
    

teacher_model = ErnieModelForSequenceClassification.from_pretrained('ernie-1.0', num_labels=2)
teacher_model.train()
if not os.path.exists('./teacher_model.pdparams'):
    g_clip = F.clip.GradientClipByGlobalNorm(1.0)
    opt = AdamW(learning_rate=LinearDecay(LR, 9600*EPOCH*0.1/BATCH, 9600*EPOCH/BATCH), parameter_list=teacher_model.parameters(), weight_decay=0.01, grad_clip=g_clip)
    for epoch in range(EPOCH):
        for step, (ids_student, ids, sids, labels) in enumerate(train_ds.start(place)):
            loss, logits = teacher_model(ids, labels=labels)
            loss.backward()
            if step % 10 == 0:
                print('[step %03d] teacher train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
            opt.minimize(loss)
            teacher_model.clear_gradients()
            if step % 100 == 0:
                f1 = evaluate_teacher(teacher_model, dev_ds)
                print('teacher f1: %.5f' %f1)
    D.save_dygraph(teacher_model.state_dict(), './teacher_model')
else:
    state_dict, _ = D.load_dygraph('./teacher_model')
    teacher_model.set_dict(state_dict)
    f1 = evaluate_teacher(teacher_model, dev_ds)
    print('teacher f1: %.5f' %f1)


# 定义finetune student 模型所需要的超参数
SEQLEN=256
BATCH=100
EPOCH=10
LR=1e-4


def evaluate_student(model, dataset):
    all_pred, all_label = [], []
    with D.base._switch_tracer_mode_guard_(is_train=False):
        model.eval()
        for step, (ids_student, ids, _, labels) in enumerate(dataset.start()):
            _, logits = model(ids_student)
            pred = L.argmax(logits, -1)
            all_pred.extend(pred.numpy())
            all_label.extend(labels.numpy())
        f1 = f1_score(all_label, all_pred, average='macro')
        model.train()
        return f1  


class BOW(D.Layer):
    def __init__(self):
        super().__init__()
        self.emb = D.Embedding([len(student_vocab), 128], padding_idx=0)
        self.fc = D.Linear(128, 2)
    def forward(self, ids, labels=None):
        embbed = self.emb(ids)
        pad_mask = L.unsqueeze(L.cast(ids!=0, 'float32'), [-1])

        embbed = L.reduce_sum(embbed * pad_mask, 1)
        embbed = L.softsign(embbed)
        logits = self.fc(embbed)
        if labels is not None:
            if len(labels.shape)==1:
                labels = L.reshape(labels, [-1, 1])
            loss = L.softmax_with_cross_entropy(logits, labels)
            loss = L.reduce_mean(loss)
        else:
            loss = None
        return loss, logits

class CNN(D.Layer):
    def __init__(self):
        super().__init__()
        self.emb = D.Embedding([30002, 128], padding_idx=0)
        self.cnn = D.Conv2D(128, 128, (1, 3), padding=(0, 1), act='relu')
        self.pool = D.Pool2D((1, 3), pool_padding=(0, 1))
        self.fc = D.Linear(128, 2)
    def forward(self, ids, labels=None):
        embbed = self.emb(ids)
        #d_batch, d_seqlen = ids.shape
        hidden = embbed
        hidden = L.transpose(hidden, [0, 2, 1]) #change to NCWH
        hidden = L.unsqueeze(hidden, [2])
        hidden = self.cnn(hidden)
        hidden = self.pool(hidden)
        hidden = L.squeeze(hidden, [2])
        hidden = L.transpose(hidden, [0, 2, 1])
        pad_mask = L.unsqueeze(L.cast(ids!=0, 'float32'), [-1])
        hidden = L.softsign(L.reduce_sum(hidden * pad_mask, 1))
        logits = self.fc(hidden)
        if labels is not None:
            if len(labels.shape)==1:
                labels = L.reshape(labels, [-1, 1])
            loss = L.softmax_with_cross_entropy(logits, labels)
            loss = L.reduce_mean(loss)
        else:
            loss = None
        return loss, logits

def KL(pred, target):
    pred = L.log(L.softmax(pred))
    target = L.softmax(target)
    loss = L.kldiv_loss(pred, target)
    return loss
    
teacher_model.eval()
model = BOW()
g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
opt = AdamW(learning_rate=LR, parameter_list=model.parameters(), weight_decay=0.01, grad_clip=g_clip)
model.train()
for epoch in range(EPOCH):
    for step, (ids_student, ids, sids, label) in enumerate(train_ds.start(place)):
        _, logits_t = teacher_model(ids, sids) # teacher 模型输出logits
        logits_t.stop_gradient=True
        _, logits_s = model(ids_student) # student 模型输出logits
        loss_ce, _ = model(ids_student, labels=label)
        loss_kd = KL(logits_s, logits_t)    # 由KL divergence度量两个分布的距离
        loss = loss_ce + loss_kd
        loss.backward()
        if step % 10 == 0:
            print('[step %03d] distill train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
        opt.minimize(loss)
        model.clear_gradients()
    f1 = evaluate_student(model, dev_ds)
    print('student f1 %.5f' % f1)

# 最后再加一轮hard label训练巩固结果
for step, (ids_student, ids, sids, label) in enumerate(train_ds.start(place)):
    loss, _ = model(ids_student, labels=label)
    loss.backward()
    if step % 10 == 0:
        print('[step %03d] train loss %.5f lr %.3e' % (step, loss.numpy(), opt.current_step_lr()))
    opt.minimize(loss)
    model.clear_gradients()

f1 = evaluate_student(model, dev_ds)
print('final f1 %.5f' % f1)