diff --git a/demo/DML/README.md b/demo/deep_mutual_learning/README.md
similarity index 71%
rename from demo/DML/README.md
rename to demo/deep_mutual_learning/README.md
index fab0faafca2bfcca2931882a7e9a75a3370e82e5..63f80292d66b78b8ce591c6908d220ae5b0d5e65 100755
--- a/demo/DML/README.md
+++ b/demo/deep_mutual_learning/README.md
@@ -1,22 +1,44 @@
 # Deep Mutual Learning (DML)
 This demo shows how to train models with PaddleSlim's deep mutual learning (DML) method. For the algorithm itself, please refer to the paper [Deep Mutual Learning](https://arxiv.org/abs/1706.00384).
+![dml_architect](./images/dml_architect.png)
+
 ## Dataset
+
 The demo trains on the cifar100 dataset. You can let it download automatically when training starts, or download the [dataset](https://www.cs.toronto.edu/~kriz/cifar-100-python.tar.gz) yourself and place it under `./dataset/cifar100` in the current directory.
 
 ## Training commands
+### Training the MobileNet-MobileNet pair
+
 Single-GPU training, using GPU 0 as an example:
+
 ```bash
 CUDA_VISIBLE_DEVICES=0 python dml_train.py
 ```
 
 Multi-GPU training, using GPUs 0-3 as an example:
 ```bash
-python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog dml_train.py --use_parallel=True
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog dml_train.py --use_parallel=True --init_lr=0.4
 ```
 
+### Training the MobileNet-ResNet50 pair
+
+Single-GPU training, using GPU 0 as an example:
+
+```bash
+CUDA_VISIBLE_DEVICES=0 python dml_train.py --models='mobilenet-resnet50'
+```
+
+Multi-GPU training, using GPUs 0-3 as an example:
+
+```bash
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog dml_train.py --use_parallel=True --init_lr=0.4 --models='mobilenet-resnet50'
+```
+
 ## Results
 The following results can be reproduced with the default configuration (learning rate, optimizer, etc.); only the model pair used in DML training was changed.
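For reference, the objective that `paddleslim.dist.DML` optimizes (see `paddleslim/dist/dml.py` below) combines each model's supervised cross-entropy with a KL term toward every peer's predictions. A minimal two-model sketch, assuming the Paddle 1.8 `fluid` API used throughout this patch:

```python
# Illustrative only: the two-model case of the DML objective.
# paddleslim.dist.DML generalizes this to any number of peer networks.
import paddle.fluid as fluid


def dml_losses(logits_a, logits_b, labels):
    # Supervised cross-entropy for each peer network.
    ce_a = fluid.layers.mean(
        fluid.layers.softmax_with_cross_entropy(logits_a, labels))
    ce_b = fluid.layers.mean(
        fluid.layers.softmax_with_cross_entropy(logits_b, labels))
    # Mutual terms: each peer matches the other's predicted distribution.
    kl_ab = fluid.layers.kldiv_loss(
        fluid.layers.log_softmax(logits_a, axis=1),
        fluid.layers.softmax(logits_b, axis=1), reduction='batchmean')
    kl_ba = fluid.layers.kldiv_loss(
        fluid.layers.log_softmax(logits_b, axis=1),
        fluid.layers.softmax(logits_a, axis=1), reduction='batchmean')
    return ce_a + kl_ab, ce_b + kl_ba  # one loss per peer
```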
model.") -add_arg('use_multiprocess', bool, True, "Whether use multiprocess reader.") add_arg('use_parallel', bool, False, "Whether to use data parallel mode to train the model.") # yapf: enable @@ -78,13 +79,9 @@ def create_reader(place, args): train_reader = fluid.contrib.reader.distributed_batch_reader( train_reader) train_loader = fluid.io.DataLoader.from_generator( - capacity=1024, - return_list=True, - use_multiprocess=args.use_multiprocess) + capacity=1024, return_list=True) valid_loader = fluid.io.DataLoader.from_generator( - capacity=1024, - return_list=True, - use_multiprocess=args.use_multiprocess) + capacity=1024, return_list=True) train_loader.set_batch_generator(train_reader, places=place) valid_loader.set_batch_generator(valid_reader, places=place) return train_loader, valid_loader @@ -160,10 +157,19 @@ def main(args): train_loader, valid_loader = create_reader(place, args) # 2. Define neural network - models = [ - MobileNetV1(class_dim=args.class_num), - MobileNetV1(class_dim=args.class_num) - ] + if args.models == "mobilenet-mobilenet": + models = [ + MobileNetV1(class_dim=args.class_num), + MobileNetV1(class_dim=args.class_num) + ] + elif args.models == "mobilenet-resnet50": + models = [ + MobileNetV1(class_dim=args.class_num), + ResNet(class_dim=args.class_num) + ] + else: + logger.info("You can define the model as you wish") + return optimizers = create_optimizer(models, args) # 3. Use PaddleSlim DML strategy diff --git a/demo/deep_mutual_learning/images/dml_architect.png b/demo/deep_mutual_learning/images/dml_architect.png new file mode 100755 index 0000000000000000000000000000000000000000..24f257aa81b3d95ac9e78c1508315ed90992e9f4 Binary files /dev/null and b/demo/deep_mutual_learning/images/dml_architect.png differ diff --git a/paddleslim/dist/dml.py b/paddleslim/dist/dml.py index 2ad1c9420b3228931a8573d46754f073967fa25c..0eba61498fef52bb2072d20e360b4e75e6988b10 100755 --- a/paddleslim/dist/dml.py +++ b/paddleslim/dist/dml.py @@ -17,11 +17,19 @@ from __future__ import division from __future__ import print_function import copy +import paddle import paddle.fluid as fluid +PADDLE_VERSION = 1.8 +try: + from paddle.fluid.layers import log_softmax +except: + from paddle.nn import LogSoftmax + PADDLE_VERSION = 2.0 + class DML(fluid.dygraph.Layer): - def __init__(self, model, use_parallel): + def __init__(self, model, use_parallel=False): super(DML, self).__init__() self.model = model self.use_parallel = use_parallel @@ -54,8 +62,7 @@ class DML(fluid.dygraph.Layer): for i in range(self.model_num): ce_losses.append( fluid.layers.mean( - fluid.layers.softmax_with_cross_entropy(logits[i], - labels))) + fluid.layers.softmax_with_cross_entropy(logits[i], labels))) return ce_losses def kl_loss(self, logits): @@ -69,7 +76,11 @@ class DML(fluid.dygraph.Layer): cur_kl_loss = 0 for j in range(self.model_num): if i != j: - x = fluid.layers.log_softmax(logits[i], axis=1) + if PADDLE_VERSION == 2.0: + log_softmax = LogSoftmax(axis=1) + x = log_softmax(logits[i]) + else: + x = fluid.layers.log_softmax(logits[i], axis=1) y = fluid.layers.softmax(logits[j], axis=1) cur_kl_loss += fluid.layers.kldiv_loss( x, y, reduction='batchmean') diff --git a/tests/test_deep_mutual_learning.py b/tests/test_deep_mutual_learning.py new file mode 100755 index 0000000000000000000000000000000000000000..60762e330b25d868dcaba98b1e45cc0c1d47dedc --- /dev/null +++ b/tests/test_deep_mutual_learning.py @@ -0,0 +1,99 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
diff --git a/tests/test_deep_mutual_learning.py b/tests/test_deep_mutual_learning.py
new file mode 100755
index 0000000000000000000000000000000000000000..60762e330b25d868dcaba98b1e45cc0c1d47dedc
--- /dev/null
+++ b/tests/test_deep_mutual_learning.py
@@ -0,0 +1,99 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+sys.path.append("../")
+import unittest
+import logging
+import numpy as np
+import paddle
+import paddle.fluid as fluid
+import paddle.dataset.mnist as reader
+from paddle.fluid.dygraph.base import to_variable
+from paddleslim.models.dygraph import MobileNetV1
+from paddleslim.dist import DML
+from paddleslim.common import get_logger
+logger = get_logger(__name__, level=logging.INFO)
+
+
+class Model(fluid.dygraph.Layer):
+    def __init__(self):
+        super(Model, self).__init__()
+        self.conv = fluid.dygraph.nn.Conv2D(
+            num_channels=1,
+            num_filters=256,
+            filter_size=3,
+            stride=1,
+            padding=1,
+            use_cudnn=False)
+        self.pool2d_avg = fluid.dygraph.nn.Pool2D(
+            pool_type='avg', global_pooling=True)
+        self.out = fluid.dygraph.nn.Linear(256, 10)
+
+    def forward(self, inputs):
+        inputs = fluid.layers.reshape(inputs, shape=[0, 1, 28, 28])
+        y = self.conv(inputs)
+        y = self.pool2d_avg(y)
+        y = fluid.layers.reshape(y, shape=[-1, 256])
+        y = self.out(y)
+        return y
+
+
+class TestDML(unittest.TestCase):
+    def test_dml(self):
+        place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda(
+        ) else fluid.CPUPlace()
+        with fluid.dygraph.guard(place):
+            train_reader = paddle.fluid.io.batch(
+                paddle.dataset.mnist.train(), batch_size=256)
+            train_loader = fluid.io.DataLoader.from_generator(
+                capacity=1024, return_list=True)
+            train_loader.set_sample_list_generator(train_reader, places=place)
+
+            models = [Model(), Model()]
+            optimizers = []
+            for cur_model in models:
+                opt = fluid.optimizer.MomentumOptimizer(
+                    0.1, 0.9, parameter_list=cur_model.parameters())
+                optimizers.append(opt)
+            dml_model = DML(models)
+            dml_optimizer = dml_model.opt(optimizers)
+
+            def train(train_loader, dml_model, dml_optimizer):
+                dml_model.train()
+                for step_id, (images, labels) in enumerate(train_loader):
+                    images, labels = to_variable(images), to_variable(labels)
+                    labels = fluid.layers.reshape(labels, [0, 1])
+
+                    logits = dml_model.forward(images)
+                    precs = [
+                        fluid.layers.accuracy(
+                            input=l, label=labels, k=1).numpy() for l in logits
+                    ]
+                    losses = dml_model.loss(logits, labels)
+                    dml_optimizer.minimize(losses)
+                    if step_id % 10 == 0:
+                        print(step_id, precs)
+
+            for epoch_id in range(10):
+                current_step_lr = dml_optimizer.get_lr()
+                lr_msg = "Epoch {}".format(epoch_id)
+                for model_id, lr in enumerate(current_step_lr):
+                    lr_msg += ", {} lr: {:.6f}".format(
+                        dml_model.full_name()[model_id], lr)
+                logger.info(lr_msg)
+                train(train_loader, dml_model, dml_optimizer)
+
+
+if __name__ == '__main__':
+    unittest.main()
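To run the new test locally, something like the following should work (the test prepends `../` to `sys.path`, so it expects to be launched from inside `tests/`, and it falls back to `CPUPlace` when Paddle is not compiled with CUDA):

```bash
cd tests
CUDA_VISIBLE_DEVICES=0 python test_deep_mutual_learning.py
```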