Unverified commit a35619b8, authored by Z zhouzj, committed by GitHub

add skd distillation. (#1587)

* add skd distillation.

* update skd's test.
Parent bddce3ea
@@ -12,5 +12,5 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from .single_distiller import merge, fsp, l2, soft_label, loss, dkd
+from .single_distiller import merge, fsp, l2, soft_label, loss, dkd, skd
 from .dml import DML
@@ -15,6 +15,7 @@
 import numpy as np
 import paddle
 from paddleslim.core import GraphWrapper
+import paddle.nn.functional as F


 def _find_var_from_program(program, var_name):
@@ -300,7 +301,10 @@ def soft_label(teacher_var_name,
                                                teacher_temperature)
     soft_label_loss = paddle.mean(
         paddle.nn.functional.cross_entropy(
-            input=student_var, label=teacher_var, soft_label=True))
+            input=student_var,
+            label=teacher_var,
+            soft_label=True,
+            use_softmax=False))
     return soft_label_loss
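The `soft_label` tweak above matters because both `student_var` and `teacher_var` have already been passed through a temperature-scaled softmax earlier in the function, so `use_softmax=False` keeps `cross_entropy` from softmaxing the student input a second time. A minimal eager-mode sketch of the resulting loss (the tensor names, shapes, and temperature below are illustrative assumptions, not part of the patch):

import paddle
import paddle.nn.functional as F

# Illustrative logits; in the patch these come from the student and
# teacher programs by variable name.
student_logits = paddle.randn([8, 10])
teacher_logits = paddle.randn([8, 10])
temperature = 2.0

# Both sides are softened before the loss, as soft_label() does upstream.
student_probs = F.softmax(student_logits / temperature)
teacher_probs = F.softmax(teacher_logits / temperature)

# With soft_label=True and use_softmax=False, cross_entropy treats both
# arguments as probabilities: -sum(teacher_probs * log(student_probs)).
loss = paddle.mean(
    F.cross_entropy(
        input=student_probs,
        label=teacher_probs,
        soft_label=True,
        use_softmax=False))

# Hand-written equivalent for comparison.
manual = paddle.mean(
    -paddle.sum(teacher_probs * paddle.log(student_probs), axis=1))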
@@ -401,3 +405,53 @@ def dkd(teacher_var_name,
         temperature=temperature,
         alpha=alpha,
         beta=beta)
def skd(teacher_var_name, student_var_name, program=None, multiplier=None):
    """Combine variables from the student model and the teacher model
    with the Spherical Knowledge Distillation loss (a.k.a. SKD loss).
    Reference: https://github.com/forjiuzhou/Spherical-Knowledge-Distillation
    Args:
        teacher_var_name(str): The name of teacher_var.
        student_var_name(str): The name of student_var.
        program(Program): The input distiller program. If not specified,
            the default program will be used. Default: None
        multiplier(float): The multiplier that rescales the normalized logits
            back to their original norm. When it is None, a suitable multiplier
            is computed from the teacher's logits as
            paddle.std(teacher_var, axis=1). Default: None.
    Returns:
        Variable: skd distiller loss.
    """
    if program is None:
        program = paddle.static.default_main_program()
    student_var = program.global_block().var(student_var_name)
    teacher_var = program.global_block().var(teacher_var_name)
    teacher_var.stop_gradient = True

    if multiplier is None:
        multiplier = paddle.std(teacher_var, axis=1, keepdim=True)

    logits_student = F.layer_norm(
        student_var,
        student_var.shape[1:],
        weight=None,
        bias=None,
        epsilon=1e-7) * multiplier
    logits_teacher = F.layer_norm(
        teacher_var,
        teacher_var.shape[1:],
        weight=None,
        bias=None,
        epsilon=1e-7) * multiplier

    student_out = F.softmax(logits_student, axis=1)
    teacher_out = F.softmax(logits_teacher, axis=1)
    skd_loss = paddle.mean(
        F.cross_entropy(
            input=student_out,
            label=teacher_out,
            soft_label=True,
            use_softmax=False))
    return skd_loss
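To make the geometry of the new loss concrete, the following eager-mode sketch reproduces the same computation outside the static program (batch size, class count, and tensor names are illustrative assumptions): both logit vectors are layer-normalized onto a common scale, rescaled by the per-sample standard deviation of the teacher's logits, and compared with a soft-label cross-entropy.

import paddle
import paddle.nn.functional as F

# Illustrative logits; skd() fetches these from the merged static program.
student_logits = paddle.randn([8, 100])
teacher_logits = paddle.randn([8, 100])
teacher_logits.stop_gradient = True

# Per-sample multiplier that restores the norm of the normalized logits.
multiplier = paddle.std(teacher_logits, axis=1, keepdim=True)

# Normalize both sets of logits, then rescale them to a comparable norm.
norm_student = F.layer_norm(
    student_logits,
    student_logits.shape[1:],
    weight=None,
    bias=None,
    epsilon=1e-7) * multiplier
norm_teacher = F.layer_norm(
    teacher_logits,
    teacher_logits.shape[1:],
    weight=None,
    bias=None,
    epsilon=1e-7) * multiplier

# Soft-label cross-entropy between the two rescaled distributions.
skd_loss = paddle.mean(
    F.cross_entropy(
        input=F.softmax(norm_student, axis=1),
        label=F.softmax(norm_teacher, axis=1),
        soft_label=True,
        use_softmax=False))

Normalizing both sets of logits before the softmax removes the scale gap between student and teacher outputs, which is the idea behind the spherical formulation.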
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
sys.path.append("../")
import unittest
import paddle
from paddleslim.dist import merge, skd
from layers import conv_bn_layer
from static_case import StaticCase


class TestSKDLoss(StaticCase):
    def test_skd_loss(self):
        place = paddle.CPUPlace()
        exe = paddle.static.Executor(place)

        student_program = paddle.static.Program()
        student_startup = paddle.static.Program()
        with paddle.static.program_guard(student_program, student_startup):
            with paddle.utils.unique_name.guard():
                input = paddle.static.data(
                    name="image", shape=[None, 3, 224, 224])
                conv1 = conv_bn_layer(input, 8, 3, "conv1")
                conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
                student_predict = conv1 + conv2

        teacher_program = paddle.static.Program()
        teacher_startup = paddle.static.Program()
        with paddle.static.program_guard(teacher_program, teacher_startup):
            with paddle.utils.unique_name.guard():
                input = paddle.static.data(
                    name="image", shape=[None, 3, 224, 224])
                conv1 = conv_bn_layer(input, 8, 3, "conv1")
                conv2 = conv_bn_layer(conv1, 8, 3, "conv2")
                sum1 = conv1 + conv2
                conv3 = conv_bn_layer(sum1, 8, 3, "conv3")
                conv4 = conv_bn_layer(conv3, 8, 3, "conv4")
                sum2 = conv4 + sum1
                conv5 = conv_bn_layer(sum2, 8, 3, "conv5")
                teacher_predict = conv_bn_layer(conv5, 8, 3, "conv6")

        exe.run(teacher_startup)
        exe.run(student_startup)

        data_name_map = {'image': 'image'}
        merge(teacher_program, student_program, data_name_map, place)
        merged_ops = []
        for block in student_program.blocks:
            for op in block.ops:
                merged_ops.append(op.type)
        with paddle.static.program_guard(student_program, student_startup):
            distill_loss = skd(
                'teacher_' + teacher_predict.name,
                student_predict.name,
                program=None,
                multiplier=None)

        loss_ops = []
        for block in student_program.blocks:
            for op in block.ops:
                loss_ops.append(op.type)
        print(f"ret: {set(loss_ops).difference(set(merged_ops))}")
        self.assertTrue(set(merged_ops).difference(set(loss_ops)) == set())

        self.assertTrue({
            'softmax_with_cross_entropy', 'softmax', 'reduce_mean', 'layer_norm'
        }.issubset(set(loss_ops).difference(set(merged_ops))))


if __name__ == '__main__':
    unittest.main()