未验证 提交 1de853ee 编写于 作者: B Bai Yifan 提交者: GitHub

Refine distillation demo (#51)

* refine distillation demo
上级 eb48cb63
# 知识蒸馏示例
本示例将介绍如何使用知识蒸馏接口训练模型,蒸馏训练得到的模型相比不使用蒸馏策略的基线模型在精度上会有一定的提升。
## 接口介绍
请参考 [知识蒸馏API文档](https://paddlepaddle.github.io/PaddleSlim/api/single_distiller_api/)
### 1. 蒸馏训练配置
示例使用ResNet50_vd作为teacher模型,对MobileNet结构的student网络进行蒸馏训练。
默认配置:
```yaml
batch_size: 256
init_lr: 0.1
lr_strategy: piecewise_decay
l2_decay: 3e-5
momentum_rate: 0.9
num_epochs: 120
data: imagenet
```
训练使用默认配置启动即可
### 2. 启动训练
在配置好ImageNet数据集后,用以下命令启动训练即可:
```shell
CUDA_VISIBLE_DEVICES=0,1,2,3 python distill.py
```
### 3. 训练结果
对比不使用蒸馏策略的基线模型(Top-1/Top-5: 70.99%/89.68%),
经过120轮的蒸馏训练,MobileNet模型的Top-1/Top-5准确率达到72.77%/90.68%,Top-1/Top-5分别提升1.78/1.00个百分点。
详细实验数据请参见[PaddleSlim模型库蒸馏部分](https://paddlepaddle.github.io/PaddleSlim/model_zoo/#13)
......@@ -23,7 +23,7 @@ _logger.setLevel(logging.INFO)
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 64*4, "Minibatch size.")
add_arg('batch_size', int, 64, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('total_images', int, 1281167, "Training image number.")
add_arg('image_shape', str, "3,224,224", "Input image size")
......@@ -32,12 +32,12 @@ add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay
add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
add_arg('num_epochs', int, 120, "The number of total epochs.")
add_arg('data', str, "cifar10", "Which data to use. 'cifar10' or 'imagenet'")
add_arg('data', str, "imagenet", "Which data to use. 'cifar10' or 'imagenet'")
add_arg('log_period', int, 20, "Log period in batches.")
add_arg('model', str, "MobileNet", "Set the network to use.")
add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
add_arg('teacher_model', str, "ResNet50", "Set the teacher network to use.")
add_arg('teacher_pretrained_model', str, "./ResNet50_pretrained", "Whether to use pretrained model.")
add_arg('teacher_model', str, "ResNet50_vd", "Set the teacher network to use.")
add_arg('teacher_pretrained_model', str, "./ResNet50_vd_pretrained", "Whether to use pretrained model.")
parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step")
# yapf: enable
......@@ -45,7 +45,12 @@ model_list = [m for m in dir(models) if "__" not in m]
def piecewise_decay(args):
step = int(math.ceil(float(args.total_images) / args.batch_size))
if args.use_gpu:
devices_num = fluid.core.get_cuda_device_count()
else:
devices_num = int(os.environ.get('CPU_NUM', 1))
step = int(math.ceil(float(args.total_images) /
args.batch_size)) * devices_num
bd = [step * e for e in args.step_epochs]
lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
......@@ -53,18 +58,23 @@ def piecewise_decay(args):
learning_rate=learning_rate,
momentum=args.momentum_rate,
regularization=fluid.regularizer.L2Decay(args.l2_decay))
return optimizer
return learning_rate, optimizer
def cosine_decay(args):
    """Build a Momentum optimizer driven by a cosine learning-rate schedule.

    Args:
        args: Parsed command-line namespace. Reads ``use_gpu``,
            ``total_images``, ``batch_size``, ``lr``, ``num_epochs``,
            ``momentum_rate`` and ``l2_decay``.

    Returns:
        A ``(learning_rate, optimizer)`` tuple: the learning-rate variable
        produced by ``fluid.layers.cosine_decay`` and the
        ``fluid.optimizer.Momentum`` instance that uses it.
    """
    # Read the flag from `args`: no `cfg` object is visible in this script,
    # and piecewise_decay reads the same flag from `args`.
    if args.use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(os.environ.get('CPU_NUM', 1))

    # Steps per epoch, scaled by the number of devices in use.
    step = int(math.ceil(float(args.total_images) /
                         args.batch_size)) * devices_num

    learning_rate = fluid.layers.cosine_decay(
        learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
    optimizer = fluid.optimizer.Momentum(
        learning_rate=learning_rate,
        momentum=args.momentum_rate,
        regularization=fluid.regularizer.L2Decay(args.l2_decay))
    # The learning-rate variable is returned alongside the optimizer so the
    # training loop can fetch and log it.
    return learning_rate, optimizer
def create_optimizer(args):
......@@ -118,9 +128,6 @@ def compress(args):
avg_cost = fluid.layers.mean(x=cost)
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
#print("="*50+"student_model_params"+"="*50)
#for v in student_program.list_vars():
# print(v.name, v.shape)
place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
......@@ -145,23 +152,19 @@ def compress(args):
name='image', shape=image_shape, dtype='float32')
predict = teacher_model.net(image, class_dim=class_dim)
#print("="*50+"teacher_model_params"+"="*50)
#for v in teacher_program.list_vars():
# print(v.name, v.shape)
exe.run(t_startup)
_download(
'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar',
'.')
_decompress('./ResNet50_pretrained.tar')
if not os.path.exists(args.teacher_pretrained_model):
_download(
'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
'.')
_decompress('./ResNet50_vd_pretrained.tar')
assert args.teacher_pretrained_model and os.path.exists(
args.teacher_pretrained_model
), "teacher_pretrained_model should be set when teacher_model is not None."
def if_exist(var):
return os.path.exists(
os.path.join(args.teacher_pretrained_model, var.name)
) and var.name != 'fc_0.w_0' and var.name != 'fc_0.b_0'
os.path.join(args.teacher_pretrained_model, var.name))
fluid.io.load_vars(
exe,
......@@ -173,9 +176,10 @@ def compress(args):
merge(teacher_program, student_program, data_name_map, place)
with fluid.program_guard(student_program, s_startup):
l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", student_program)
loss = avg_cost + l2_loss
opt = create_optimizer(args)
distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
student_program)
loss = avg_cost + distill_loss
lr, opt = create_optimizer(args)
opt.minimize(loss)
exe.run(s_startup)
build_strategy = fluid.BuildStrategy()
......@@ -185,14 +189,17 @@ def compress(args):
for epoch_id in range(args.num_epochs):
for step_id, data in enumerate(train_loader):
loss_1, loss_2, loss_3 = exe.run(
lr_np, loss_1, loss_2, loss_3 = exe.run(
parallel_main,
feed=data,
fetch_list=[loss.name, avg_cost.name, l2_loss.name])
fetch_list=[
lr.name, loss.name, avg_cost.name, distill_loss.name
])
if step_id % args.log_period == 0:
_logger.info(
"train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}".
format(epoch_id, step_id, loss_1[0], loss_2[0], loss_3[0]))
"train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}".
format(epoch_id, step_id, lr_np[0], loss_1[0], loss_2[0],
loss_3[0]))
val_acc1s = []
val_acc5s = []
for step_id, data in enumerate(valid_loader):
......
from .mobilenet import MobileNet
from .resnet import ResNet34, ResNet50
from .resnet_vd import ResNet50_vd
from .mobilenet_v2 import MobileNetV2
from .pvanet import PVANet
__all__ = ['MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2', 'PVANet']
__all__ = [
'MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2', 'PVANet', 'ResNet50_vd'
]
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = [
"ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
"ResNet152_vd", "ResNet200_vd"
]
class ResNet():
    """Builder for ResNet-style classification networks on ``paddle.fluid``.

    ``layers`` selects the depth (18, 34, 50, 101, 152 or 200). ``is_3x3``
    selects the stem: three stacked 3x3 convolutions when true, a single
    7x7 convolution otherwise.

    Parameter names are derived from the ``name`` strings passed to the
    layer helpers, so they must stay byte-for-byte stable for checkpoints
    that are loaded by variable name.
    """

    def __init__(self, layers=50, is_3x3=False):
        """Store the depth and stem configuration; no graph is built here."""
        self.layers = layers
        self.is_3x3 = is_3x3

    def net(self, input, class_dim=1000):
        """Append the full network to the current program.

        Args:
            input: Input image variable fed to the stem convolution(s).
            class_dim: Output size of the final fully connected layer.

        Returns:
            The output variable of the final fully connected layer.

        Raises:
            AssertionError: If ``self.layers`` is not a supported depth.
        """
        is_3x3 = self.is_3x3
        layers = self.layers
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, layers)

        # Number of residual blocks in each of the four stages.
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_filters = [64, 128, 256, 512]

        if not is_3x3:
            # The layer helpers build parameter names from `name`, so it must
            # not be None. "conv1" is the name conv_bn_layer special-cases
            # when deriving the batch-norm prefix ("bn_conv1").
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu',
                name='conv1')
        else:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=32,
                filter_size=3,
                stride=2,
                act='relu',
                name='conv1_1')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_2')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=64,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_3')

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')

        # Depths of 50 and above use bottleneck blocks; 18 and 34 use basic
        # blocks. The stage/block iteration is otherwise the same.
        build_block = (self.bottleneck_block
                       if layers >= 50 else self.basic_block)
        for block, repeats in enumerate(depth):
            for i in range(repeats):
                if layers in [101, 152, 200] and block == 2:
                    # Third stage of the deep variants: "a", then "b1",
                    # "b2", ... instead of consecutive letters.
                    if i == 0:
                        conv_name = "res" + str(block + 2) + "a"
                    else:
                        conv_name = "res" + str(block + 2) + "b" + str(i)
                else:
                    conv_name = "res" + str(block + 2) + chr(97 + i)
                conv = build_block(
                    input=conv,
                    num_filters=num_filters[block],
                    # First block of every stage but the first uses stride 2.
                    stride=2 if i == 0 and block != 0 else 1,
                    if_first=block == i == 0,
                    name=conv_name)

        pool = fluid.layers.pool2d(
            input=conv, pool_type='avg', global_pooling=True)
        # Uniform init range for the classifier, from pool.shape[1].
        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        out = fluid.layers.fc(
            input=pool,
            size=class_dim,
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv)))
        return out

    def _batch_norm(self, input, act, name):
        """Append a batch-norm layer whose parameter names derive from ``name``."""
        # "conv1" maps to "bn_conv1"; any other name drops its first three
        # characters and gains a "bn" prefix (e.g. "res2a_branch2a" ->
        # "bn2a_branch2a"). Kept exactly as-is so saved variables still match.
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=input,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Append a bias-free convolution followed by batch norm.

        Args:
            input: Input variable.
            num_filters: Number of convolution output channels.
            filter_size: Kernel size; padding is ``(filter_size - 1) // 2``.
            stride: Convolution stride.
            groups: Convolution groups.
            act: Activation applied by the batch-norm layer, or None.
            name: Base name for the parameters. Must be a string: the
                weight is named ``name + "_weights"``.

        Returns:
            The batch-norm output variable.
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        return self._batch_norm(conv, act, name)

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None):
        """Append 2x2 average pooling, a stride-1 convolution and batch norm.

        The pooling layer uses a fixed stride of 2 and the convolution a
        fixed stride of 1; the ``stride`` argument is accepted for signature
        parity with ``conv_bn_layer`` but is not read.

        Args:
            input: Input variable.
            num_filters: Number of convolution output channels.
            filter_size: Kernel size; padding is ``(filter_size - 1) // 2``.
            stride: Unused.
            groups: Convolution groups.
            act: Activation applied by the batch-norm layer, or None.
            name: Base name for the parameters. Must be a string.

        Returns:
            The batch-norm output variable.
        """
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True)
        conv = fluid.layers.conv2d(
            input=pool,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=1,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        return self._batch_norm(conv, act, name)

    def shortcut(self, input, ch_out, stride, name, if_first=False):
        """Return the residual branch for a block.

        A 1x1 projection is added when ``input.shape[1]`` differs from
        ``ch_out``, when ``stride`` is not 1, or when ``if_first`` is set.
        The very first block uses ``conv_bn_layer``; later projecting blocks
        use ``conv_bn_layer_new``. Otherwise ``input`` is returned unchanged.
        """
        ch_in = input.shape[1]
        needs_projection = ch_in != ch_out or stride != 1
        if if_first:
            # The first block always projects with a plain 1x1 convolution.
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        if needs_projection:
            return self.conv_bn_layer_new(
                input, ch_out, 1, stride, name=name)
        return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first):
        """Append a 1x1 -> 3x3 -> 1x1 bottleneck block with a residual add.

        The 3x3 convolution carries ``stride``; the last 1x1 convolution and
        the shortcut both produce ``num_filters * 4`` channels.
        """
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")

        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1")

        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first):
        """Append a two-layer 3x3 basic block with a residual add.

        The first 3x3 convolution carries ``stride``; the shortcut produces
        ``num_filters`` channels.
        """
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
def ResNet18_vd():
    """Return a ``ResNet`` configured with ``layers=18`` and ``is_3x3=True``."""
    return ResNet(layers=18, is_3x3=True)
def ResNet34_vd():
    """Return a ``ResNet`` configured with ``layers=34`` and ``is_3x3=True``."""
    return ResNet(layers=34, is_3x3=True)
def ResNet50_vd():
    """Return a ``ResNet`` configured with ``layers=50`` and ``is_3x3=True``."""
    return ResNet(layers=50, is_3x3=True)
def ResNet101_vd():
    """Return a ``ResNet`` configured with ``layers=101`` and ``is_3x3=True``."""
    return ResNet(layers=101, is_3x3=True)
def ResNet152_vd():
    """Return a ``ResNet`` configured with ``layers=152`` and ``is_3x3=True``."""
    return ResNet(layers=152, is_3x3=True)
def ResNet200_vd():
    """Return a ``ResNet`` configured with ``layers=200`` and ``is_3x3=True``."""
    return ResNet(layers=200, is_3x3=True)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册