diff --git a/demo/darts/README.md b/demo/darts/README.md
index b5fb53a11349c0ead48d311e81281867b74dea08..92ce4079a1574453d418f72ac51d2d9549f2e582 100644
--- a/demo/darts/README.md
+++ b/demo/darts/README.md
@@ -2,9 +2,31 @@
 This demo shows how to run differentiable architecture search with PaddlePaddle. The [DARTS](https://arxiv.org/abs/1806.09055) and [PC-DARTS](https://arxiv.org/abs/1907.05737) methods can be used directly, and the code can also be modified to run other differentiable architecture search algorithms.
 
+The directory layout of this demo is as follows:
+```
+├── genotypes.py        Genotypes of the architectures found by search
+│
+├── model.py            Builds the subnetwork found by search
+│
+├── model_search.py     Builds the supernetwork used during search
+│
+├── operations.py       The pool of candidate operations used in search
+│
+├── reader.py           Data reading and augmentation
+│
+├── search.py           Entry point for architecture search
+│
+├── train.py            Entry point for evaluation training on CIFAR-10
+│
+├── train_imagenet.py   Entry point for evaluation training on ImageNet
+│
+├── visualize.py        Entry point for architecture visualization
+
+```
+
 ## Dependencies
 
-> PaddlePaddle >= 1.7.0, graphviz >= 0.11.1
+PaddlePaddle >= 1.8.0, PaddleSlim >= 1.1.0, graphviz >= 0.11.1
 
 ## Datasets
 
@@ -21,6 +43,14 @@
 python search.py --unrolled=True           # DARTS search with second-order approximation
 python search.py --method='PC-DARTS' --batch_size=256 --learning_rate=0.1 --arch_learning_rate=6e-4 --epochs_no_archopt=15   # PC-DARTS search
 ```
+Architecture search can also run on multiple GPUs. Taking 4 GPUs (GPU ids 0-3) as an example, the launch command is:
+
+```bash
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog search.py --use_data_parallel 1
+```
+
+Because multi-GPU training enlarges the total batch size by a factor of n, where n is the number of GPUs, scale the initial learning rate by n accordingly to match single-GPU accuracy.
+
 Figure 1 shows how the architecture changes over the search epochs. Note that the accuracy (Acc) in the figure is not the final accuracy of that architecture; to get the best accuracy of a given architecture, run evaluation training on the resulting genotype.
 
 ![networks](images/networks.gif)
@@ -40,6 +70,15 @@
 python train.py --arch='PC_DARTS'            # evaluation training of the searched architecture on CIFAR-10
 python train_imagenet.py --arch='PC_DARTS'   # evaluation training of the searched architecture on ImageNet
 ```
+Evaluation training likewise supports multiple GPUs. Taking 4 GPUs (GPU ids 0-3) as an example, the launch commands are:
+
+```bash
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train.py --use_data_parallel 1 --arch='DARTS_V2'
+python -m paddle.distributed.launch --selected_gpus=0,1,2,3 --log_dir ./mylog train_imagenet.py --use_data_parallel 1 --arch='DARTS_V2'
+```
+
+As in search, multi-GPU training enlarges the total batch size by a factor of n, where n is the number of GPUs, so scale the initial learning rate by n accordingly to match single-GPU accuracy.
+
 The evaluation training results for the searched `DARTS_V1`, `DARTS_V2`, and `PC-DARTS` architectures are as follows:
 
 | Architecture | Dataset | Accuracy |
diff --git a/demo/darts/search.py b/demo/darts/search.py
index 40e2c0d05670985675031604eab8f9148b521d93..f89408d2958c8ad9e37997b955ddc16aa66fd90d 100644
--- a/demo/darts/search.py
+++ b/demo/darts/search.py
@@ -80,6 +80,7 @@ def main(args):
         model,
         train_reader,
         valid_reader,
+        place,
         learning_rate=args.learning_rate,
         batchsize=args.batch_size,
         num_imgs=args.trainset_num,
@@ -87,8 +88,8 @@ def main(args):
         unrolled=args.unrolled,
         num_epochs=args.epochs,
         epochs_no_archopt=args.epochs_no_archopt,
-        use_gpu=args.use_gpu,
         use_data_parallel=args.use_data_parallel,
+        save_dir=args.model_save_dir,
         log_freq=args.log_freq)
     searcher.train()
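The learning-rate advice repeated in the README above is the standard linear scaling rule for data parallelism. A minimal sketch of the arithmetic (the helper name `scale_lr` is illustrative, not part of this repo; 0.025 is the single-GPU default from `DARTSearch`):

```python
def scale_lr(base_lr, device_num):
    """Linear scaling rule: data parallelism multiplies the effective
    batch size by device_num, so the learning rate scales to match."""
    return base_lr * device_num

# Single-GPU search defaults to base_lr=0.025; for the 4-GPU launch above:
print(scale_lr(0.025, 4))  # 0.1
```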
diff --git a/demo/darts/train.py b/demo/darts/train.py
index 77c8e3e54658e36fc5123aa3effd80bfb8354ddc..70c48f37b946c718ee32ecffb6a028b032a3363a 100644
--- a/demo/darts/train.py
+++ b/demo/darts/train.py
@@ -19,13 +19,14 @@ from __future__ import print_function
 import os
 import sys
 import ast
+import logging
 import argparse
 import functools
-import logging
 
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.base import to_variable
 from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.nas.darts import count_parameters_in_MB
 
 import genotypes
 import reader
@@ -140,9 +141,6 @@ def main(args):
         if args.use_data_parallel else fluid.CUDAPlace(0)
 
     with fluid.dygraph.guard(place):
-        if args.use_data_parallel:
-            strategy = fluid.dygraph.parallel.prepare_context()
-
         genotype = eval("genotypes.%s" % args.arch)
         model = Network(
             C=args.init_channels,
@@ -151,7 +149,12 @@ def main(args):
             auxiliary=args.auxiliary,
             genotype=genotype)
 
-        step_per_epoch = int(args.trainset_num / args.batch_size)
+        logger.info("param size = {:.6f}MB".format(
+            count_parameters_in_MB(model.parameters())))
+
+        device_num = fluid.dygraph.parallel.Env().nranks
+        step_per_epoch = int(args.trainset_num /
+                             (args.batch_size * device_num))
         learning_rate = fluid.dygraph.CosineDecay(args.learning_rate,
                                                   step_per_epoch, args.epochs)
         clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=args.grad_clip)
@@ -163,18 +166,21 @@ def main(args):
             grad_clip=clip)
 
         if args.use_data_parallel:
+            strategy = fluid.dygraph.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
         train_loader = fluid.io.DataLoader.from_generator(
-            capacity=64,
+            capacity=1024,
             use_double_buffer=True,
             iterable=True,
-            return_list=True)
+            return_list=True,
+            use_multiprocess=True)
         valid_loader = fluid.io.DataLoader.from_generator(
-            capacity=64,
+            capacity=1024,
             use_double_buffer=True,
             iterable=True,
-            return_list=True)
+            return_list=True,
+            use_multiprocess=True)
 
         train_reader = reader.train_valid(
             batch_size=args.batch_size,
@@ -186,13 +192,13 @@ def main(args):
             is_train=False,
             is_shuffle=False,
             args=args)
-        train_loader.set_batch_generator(train_reader, places=place)
-        valid_loader.set_batch_generator(valid_reader, places=place)
-
         if args.use_data_parallel:
             train_reader = fluid.contrib.reader.distributed_batch_reader(
                 train_reader)
 
+        train_loader.set_batch_generator(train_reader, places=place)
+        valid_loader.set_batch_generator(valid_reader, places=place)
+
         save_parameters = (not args.use_data_parallel) or (
             args.use_data_parallel and
             fluid.dygraph.parallel.Env().local_rank == 0)
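The new `step_per_epoch` computation above accounts for the dataset being sharded across ranks. A minimal sketch of the arithmetic, assuming `distributed_batch_reader` splits the batch stream evenly across `device_num` ranks:

```python
def steps_per_epoch(trainset_num, batch_size, device_num):
    # Each step consumes batch_size samples on each of device_num ranks,
    # so one epoch takes trainset_num / (batch_size * device_num) steps.
    return int(trainset_num / (batch_size * device_num))

assert steps_per_epoch(50000, 64, 1) == 781  # CIFAR-10, single GPU
assert steps_per_epoch(50000, 64, 4) == 195  # 4 GPUs: roughly 1/4 the steps
```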
diff --git a/demo/darts/train_imagenet.py b/demo/darts/train_imagenet.py
index f755f00f100519ff38aaed3fa869e06b4948e568..2bbf738ccc8c27395dfe228f30d157fe3115efe7 100644
--- a/demo/darts/train_imagenet.py
+++ b/demo/darts/train_imagenet.py
@@ -19,13 +19,15 @@ from __future__ import print_function
 import os
 import sys
 import ast
+import logging
 import argparse
 import functools
-import logging
 
 import paddle.fluid as fluid
 from paddle.fluid.dygraph.base import to_variable
 from paddleslim.common import AvgrageMeter, get_logger
+from paddleslim.nas.darts import count_parameters_in_MB
+
 import genotypes
 import reader
 from model import NetworkImageNet as Network
@@ -152,9 +154,6 @@ def main(args):
         if args.use_data_parallel else fluid.CUDAPlace(0)
 
     with fluid.dygraph.guard(place):
-        if args.use_data_parallel:
-            strategy = fluid.dygraph.parallel.prepare_context()
-
         genotype = eval("genotypes.%s" % args.arch)
         model = Network(
             C=args.init_channels,
@@ -163,7 +162,12 @@ def main(args):
             auxiliary=args.auxiliary,
             genotype=genotype)
 
-        step_per_epoch = int(args.trainset_num / args.batch_size)
+        logger.info("param size = {:.6f}MB".format(
+            count_parameters_in_MB(model.parameters())))
+
+        device_num = fluid.dygraph.parallel.Env().nranks
+        step_per_epoch = int(args.trainset_num /
+                             (args.batch_size * device_num))
         learning_rate = fluid.dygraph.ExponentialDecay(
             args.learning_rate,
             step_per_epoch,
@@ -179,6 +183,7 @@ def main(args):
             grad_clip=clip)
 
         if args.use_data_parallel:
+            strategy = fluid.dygraph.parallel.prepare_context()
             model = fluid.dygraph.parallel.DataParallel(model, strategy)
 
         train_loader = fluid.io.DataLoader.from_generator(
@@ -199,20 +204,19 @@ def main(args):
         valid_reader = fluid.io.batch(
             reader.imagenet_reader(args.data_dir, 'val'),
             batch_size=args.batch_size)
-
-        train_loader.set_sample_list_generator(train_reader, places=place)
-        valid_loader.set_sample_list_generator(valid_reader, places=place)
-
         if args.use_data_parallel:
             train_reader = fluid.contrib.reader.distributed_batch_reader(
                 train_reader)
 
+        train_loader.set_sample_list_generator(train_reader, places=place)
+        valid_loader.set_sample_list_generator(valid_reader, places=place)
+
         save_parameters = (not args.use_data_parallel) or (
             args.use_data_parallel and
             fluid.dygraph.parallel.Env().local_rank == 0)
         best_top1 = 0
         for epoch in range(args.epochs):
-            logging.info('Epoch {}, lr {:.6f}'.format(
+            logger.info('Epoch {}, lr {:.6f}'.format(
                 epoch, optimizer.current_step_lr()))
             train_top1, train_top5 = train(model, train_loader, optimizer,
                                            epoch, args)
diff --git a/docs/zh_cn/api_cn/darts.rst b/docs/zh_cn/api_cn/darts.rst
index e4ace282c4968c67c69f6cb519ed6bff4b963514..41f23e9c7ffa31425a47f9f504a900efc54c9ee1 100644
--- a/docs/zh_cn/api_cn/darts.rst
+++ b/docs/zh_cn/api_cn/darts.rst
@@ -97,7 +97,7 @@ DARTSearch
    model = SuperNet()
    train_reader = batch_generator_creator()
    valid_reader = batch_generator_creator()
-   searcher = DARTSearch(model, train_reader, valid_reader, unrolled=False)
+   searcher = DARTSearch(model, train_reader, valid_reader, place)
    searcher.train()
 
 ..
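The reordering in train.py and train_imagenet.py above fixes a subtle multi-GPU bug: the loader must be bound to the reader after `distributed_batch_reader` wraps it, otherwise every rank replays the full dataset. A minimal sketch of the fixed ordering (shown for train.py's batch-generator variant), assuming the Paddle 1.8 fluid APIs used in this diff:

```python
import paddle.fluid as fluid

def bind_reader(loader, batch_reader, place, use_data_parallel):
    if use_data_parallel:
        # Wrap first, so each rank only sees its own shard of the batches.
        batch_reader = fluid.contrib.reader.distributed_batch_reader(
            batch_reader)
    # Bind the (possibly wrapped) reader. Binding before wrapping would
    # leave the unsharded reader attached to the loader, as the old code did.
    loader.set_batch_generator(batch_reader, places=place)
```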
diff --git a/paddleslim/nas/darts/architect.py b/paddleslim/nas/darts/architect.py
index e2e90a0644071ec13f93862bb1c060e220fb7ea7..98d8d6ca7b7fb375f5ad8e0f55e6cf2e66c917e6 100644
--- a/paddleslim/nas/darts/architect.py
+++ b/paddleslim/nas/darts/architect.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -21,7 +21,8 @@ from paddle.fluid.dygraph.base import to_variable
 
 
 class Architect(object):
-    def __init__(self, model, eta, arch_learning_rate, place, unrolled):
+    def __init__(self, model, eta, arch_learning_rate, place, unrolled,
+                 parallel):
         self.network_momentum = 0.9
         self.network_weight_decay = 3e-4
         self.eta = eta
@@ -34,6 +35,7 @@
             parameter_list=self.model.arch_parameters())
         self.place = place
         self.unrolled = unrolled
+        self.parallel = parallel
         if self.unrolled:
             self.unrolled_model = self.model.new()
             self.unrolled_model_params = [
@@ -49,6 +51,17 @@
                     self.network_weight_decay),
                 parameter_list=self.unrolled_model_params)
 
+        if self.parallel:
+            strategy = fluid.dygraph.parallel.prepare_context()
+            self.parallel_model = fluid.dygraph.parallel.DataParallel(
+                self.model, strategy)
+            if self.unrolled:
+                self.parallel_unrolled_model = fluid.dygraph.parallel.DataParallel(
+                    self.unrolled_model, strategy)
+
+    def get_model(self):
+        return self.parallel_model if self.parallel else self.model
+
     def step(self, input_train, target_train, input_valid, target_valid):
         if self.unrolled:
             params_grads = self._backward_step_unrolled(
@@ -61,7 +74,12 @@
 
     def _backward_step(self, input_valid, target_valid):
         loss = self.model._loss(input_valid, target_valid)
-        loss.backward()
+        if self.parallel:
+            loss = self.parallel_model.scale_loss(loss)
+            loss.backward()
+            self.parallel_model.apply_collective_grads()
+        else:
+            loss.backward()
         return loss
 
     def _backward_step_unrolled(self, input_train, target_train, input_valid,
@@ -69,7 +87,14 @@
         self._compute_unrolled_model(input_train, target_train)
         unrolled_loss = self.unrolled_model._loss(input_valid, target_valid)
 
-        unrolled_loss.backward()
+        if self.parallel:
+            unrolled_loss = self.parallel_unrolled_model.scale_loss(
+                unrolled_loss)
+            unrolled_loss.backward()
+            self.parallel_unrolled_model.apply_collective_grads()
+        else:
+            unrolled_loss.backward()
+
         vector = [
             to_variable(param._grad_ivar().numpy())
             for param in self.unrolled_model_params
@@ -93,7 +118,13 @@
                                 self.model.parameters()):
             x.value().get_tensor().set(y.numpy(), self.place)
         loss = self.unrolled_model._loss(input, target)
-        loss.backward()
+        if self.parallel:
+            loss = self.parallel_unrolled_model.scale_loss(loss)
+            loss.backward()
+            self.parallel_unrolled_model.apply_collective_grads()
+        else:
+            loss.backward()
+
         self.unrolled_optimizer.minimize(loss)
         self.unrolled_model.clear_gradients()
@@ -112,7 +143,13 @@
             param_p = param + grad * R
             param.value().get_tensor().set(param_p.numpy(), self.place)
         loss = self.model._loss(input, target)
-        loss.backward()
+        if self.parallel:
+            loss = self.parallel_model.scale_loss(loss)
+            loss.backward()
+            self.parallel_model.apply_collective_grads()
+        else:
+            loss.backward()
+
         grads_p = [
             to_variable(param._grad_ivar().numpy())
             for param in self.model.arch_parameters()
@@ -124,7 +161,13 @@
         self.model.clear_gradients()
 
         loss = self.model._loss(input, target)
-        loss.backward()
+        if self.parallel:
+            loss = self.parallel_model.scale_loss(loss)
+            loss.backward()
+            self.parallel_model.apply_collective_grads()
+        else:
+            loss.backward()
+
         grads_n = [
             to_variable(param._grad_ivar().numpy())
             for param in self.model.arch_parameters()
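The same backward pattern now appears five times in `Architect`. A sketch of that pattern factored into one helper (`backward_maybe_parallel` is illustrative, not part of PaddleSlim), assuming the semantics of Paddle 1.8's dygraph `DataParallel`:

```python
def backward_maybe_parallel(loss, parallel_model=None):
    """Run backward(), adding the collective steps DataParallel requires."""
    if parallel_model is not None:
        # scale_loss divides the loss by the trainer count so that the
        # summed gradients average out across ranks after the all-reduce.
        loss = parallel_model.scale_loss(loss)
        loss.backward()
        # apply_collective_grads all-reduces the local gradients.
        parallel_model.apply_collective_grads()
    else:
        loss.backward()
    return loss
```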
diff --git a/paddleslim/nas/darts/train_search.py b/paddleslim/nas/darts/train_search.py
index 055e13774c6ba93a46b1c0b4f92d9ace7a6920a4..74d1b7eeee4b5b210f84ac3b9e9c8f4bf287c98e 100644
--- a/paddleslim/nas/darts/train_search.py
+++ b/paddleslim/nas/darts/train_search.py
@@ -16,8 +16,9 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-__all__ = ['DARTSearch']
+__all__ = ['DARTSearch', 'count_parameters_in_MB']
 
+import os
 import logging
 import numpy as np
 import paddle.fluid as fluid
@@ -67,19 +68,21 @@
                  model,
                  train_reader,
                  valid_reader,
+                 place,
                  learning_rate=0.025,
                  batchsize=64,
                  num_imgs=50000,
                  arch_learning_rate=3e-4,
-                 unrolled='False',
+                 unrolled=False,
                  num_epochs=50,
                  epochs_no_archopt=0,
-                 use_gpu=True,
                  use_data_parallel=False,
+                 save_dir='./',
                  log_freq=50):
         self.model = model
         self.train_reader = train_reader
         self.valid_reader = valid_reader
+        self.place = place
         self.learning_rate = learning_rate
         self.batchsize = batchsize
         self.num_imgs = num_imgs
@@ -87,14 +90,8 @@
         self.unrolled = unrolled
         self.epochs_no_archopt = epochs_no_archopt
         self.num_epochs = num_epochs
-        self.use_gpu = use_gpu
         self.use_data_parallel = use_data_parallel
-        if not self.use_gpu:
-            self.place = fluid.CPUPlace()
-        elif not self.use_data_parallel:
-            self.place = fluid.CUDAPlace(0)
-        else:
-            self.place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
+        self.save_dir = save_dir
         self.log_freq = log_freq
 
     def train_one_epoch(self, train_loader, valid_loader, architect, optimizer,
@@ -187,9 +184,13 @@
         ]
         logger.info("param size = {:.6f}MB".format(
             count_parameters_in_MB(model_parameters)))
-        step_per_epoch = int(self.num_imgs * 0.5 / self.batchsize)
+
+        device_num = fluid.dygraph.parallel.Env().nranks
+        step_per_epoch = int(self.num_imgs * 0.5 /
+                             (self.batchsize * device_num))
         if self.unrolled:
             step_per_epoch *= 2
+
         learning_rate = fluid.dygraph.CosineDecay(
             self.learning_rate, step_per_epoch, self.num_epochs)
@@ -202,30 +203,37 @@
             grad_clip=clip)
 
         if self.use_data_parallel:
-            self.model = fluid.dygraph.parallel.DataParallel(self.model,
-                                                             strategy)
             self.train_reader = fluid.contrib.reader.distributed_batch_reader(
                 self.train_reader)
             self.valid_reader = fluid.contrib.reader.distributed_batch_reader(
                 self.valid_reader)
 
         train_loader = fluid.io.DataLoader.from_generator(
-            capacity=64,
+            capacity=1024,
             use_double_buffer=True,
             iterable=True,
-            return_list=True)
+            return_list=True,
+            use_multiprocess=True)
         valid_loader = fluid.io.DataLoader.from_generator(
-            capacity=64,
+            capacity=1024,
             use_double_buffer=True,
             iterable=True,
-            return_list=True)
+            return_list=True,
+            use_multiprocess=True)
 
         train_loader.set_batch_generator(self.train_reader, places=self.place)
         valid_loader.set_batch_generator(self.valid_reader, places=self.place)
 
-        architect = Architect(self.model, learning_rate,
-                              self.arch_learning_rate, self.place,
-                              self.unrolled)
+        base_model = self.model
+        architect = Architect(
+            model=self.model,
+            eta=learning_rate,
+            arch_learning_rate=self.arch_learning_rate,
+            place=self.place,
+            unrolled=self.unrolled,
+            parallel=self.use_data_parallel)
+
+        self.model = architect.get_model()
 
         save_parameters = (not self.use_data_parallel) or (
             self.use_data_parallel and
@@ -234,7 +242,8 @@
         for epoch in range(self.num_epochs):
             logger.info('Epoch {}, lr {:.6f}'.format(
                 epoch, optimizer.current_step_lr()))
-            genotype = get_genotype(self.model)
+
+            genotype = get_genotype(base_model)
             logger.info('genotype = %s', genotype)
 
             train_top1 = self.train_one_epoch(train_loader, valid_loader,
@@ -246,4 +255,6 @@
             logger.info("Epoch {}, valid_acc {:.6f}".format(epoch, valid_top1))
 
             if save_parameters:
-                fluid.save_dygraph(self.model.state_dict(), "./weights")
+                fluid.save_dygraph(
+                    self.model.state_dict(),
+                    os.path.join(self.save_dir, str(epoch), "params"))
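For reference, a minimal sketch of the updated `DARTSearch` API after this patch: the caller now constructs `place` itself, and each epoch's checkpoint lands under `save_dir/<epoch>/params`. `SuperNet` and `batch_generator_creator` are the same placeholders used in the darts.rst example, not real symbols:

```python
import paddle.fluid as fluid
from paddleslim.nas.darts import DARTSearch

# The caller now chooses the device; with use_data_parallel each rank
# would use fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) instead.
place = fluid.CUDAPlace(0)
with fluid.dygraph.guard(place):
    model = SuperNet()                  # placeholder supernet
    searcher = DARTSearch(
        model,
        batch_generator_creator(),      # train_reader (placeholder)
        batch_generator_creator(),      # valid_reader (placeholder)
        place,
        save_dir='./search_ckpt')       # epoch N -> ./search_ckpt/N/params
    searcher.train()
```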