diff --git a/demo/distillation/train.py b/demo/distillation/train.py new file mode 100644 index 0000000000000000000000000000000000000000..7f389168440a59f0872d44ab6e62f262e373f6f0 --- /dev/null +++ b/demo/distillation/train.py @@ -0,0 +1,238 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import sys +import math +import logging +import paddle +import argparse +import functools +import numpy as np +import paddle.fluid as fluid +sys.path.append(sys.path[0] + "/../") +import models +import imagenet_reader as reader +from utility import add_arguments, print_arguments +from paddleslim.dist import merge, l2_loss, soft_label_loss, fsp_loss + +logging.basicConfig(format='%(asctime)s-%(levelname)s: %(message)s') +_logger = logging.getLogger(__name__) +_logger.setLevel(logging.INFO) + +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 64*4, "Minibatch size.") +add_arg('use_gpu', bool, True, "Whether to use GPU or not.") +add_arg('total_images', int, 1281167, "Training image number.") +add_arg('image_shape', str, "3,224,224", "Input image size") +add_arg('lr', float, 0.1, "The learning rate used to fine-tune pruned model.") +add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay strategy.") +add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.") +add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.") +add_arg('num_epochs', int, 120, "The number of total epochs.") +add_arg('data', str, "mnist", "Which data to use. 'mnist' or 'imagenet'") +add_arg('log_period', int, 20, "Log period in batches.") +add_arg('model', str, "MobileNet", "Set the network to use.") +add_arg('pretrained_model', str, None, "Whether to use pretrained model.") +add_arg('teacher_model', str, "ResNet50", "Set the teacher network to use.") +add_arg('teacher_pretrained_model', str, "../pretrain/ResNet50_pretrained", "Whether to use pretrained model.") +parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step") +# yapf: enable + +model_list = [m for m in dir(models) if "__" not in m] + + +def piecewise_decay(args): + step = int(math.ceil(float(args.total_images) / args.batch_size)) + bd = [step * e for e in args.step_epochs] + lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)] + learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=args.momentum_rate, + regularization=fluid.regularizer.L2Decay(args.l2_decay)) + return optimizer + + +def cosine_decay(args): + step = int(math.ceil(float(args.total_images) / args.batch_size)) + learning_rate = fluid.layers.cosine_decay( + learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=args.momentum_rate, + regularization=fluid.regularizer.L2Decay(args.l2_decay)) + return optimizer + + +def create_optimizer(args): + if args.lr_strategy == "piecewise_decay": + return piecewise_decay(args) + elif args.lr_strategy == "cosine_decay": + return cosine_decay(args) + + +def compress(args): + if args.data == "mnist": + import paddle.dataset.mnist as reader + train_reader = reader.train() + val_reader = reader.test() + class_dim = 10 + image_shape = "1,28,28" + elif args.data == "imagenet": + import imagenet_reader as reader + train_reader = 
reader.train() + val_reader = reader.val() + class_dim = 1000 + image_shape = "3,224,224" + else: + raise ValueError("{} is not supported.".format(args.data)) + image_shape = [int(m) for m in image_shape.split(",")] + + assert args.model in model_list, "{} is not in lists: {}".format( + args.model, model_list) + student_program = fluid.Program() + s_startup = fluid.Program() + + with fluid.program_guard(student_program, s_startup): + with fluid.unique_name.guard(): + image = fluid.layers.data( + name='image', shape=image_shape, dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + train_loader = fluid.io.DataLoader.from_generator( + feed_list=[image, label], + capacity=64, + use_double_buffer=True, + iterable=True) + valid_loader = fluid.io.DataLoader.from_generator( + feed_list=[image, label], + capacity=64, + use_double_buffer=True, + iterable=True) + # model definition + model = models.__dict__[args.model]() + out = model.net(input=image, class_dim=class_dim) + cost = fluid.layers.cross_entropy(input=out, label=label) + avg_cost = fluid.layers.mean(x=cost) + acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) + #print("="*50+"student_model_params"+"="*50) + #for v in student_program.list_vars(): + # print(v.name, v.shape) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = fluid.Executor(place) + + train_reader = paddle.batch( + train_reader, batch_size=args.batch_size, drop_last=True) + val_reader = paddle.batch( + val_reader, batch_size=args.batch_size, drop_last=True) + val_program = student_program.clone(for_test=True) + + places = fluid.cuda_places() + train_loader.set_sample_list_generator(train_reader, places) + valid_loader.set_sample_list_generator(val_reader, place) + + teacher_model = models.__dict__[args.teacher_model]() + # define teacher program + teacher_program = fluid.Program() + t_startup = fluid.Program() + teacher_scope = fluid.Scope() + with fluid.scope_guard(teacher_scope): + with fluid.program_guard(teacher_program, t_startup): + with fluid.unique_name.guard(): + image = fluid.layers.data( + name='image', shape=image_shape, dtype='float32') + predict = teacher_model.net(image, class_dim=class_dim) + + #print("="*50+"teacher_model_params"+"="*50) + #for v in teacher_program.list_vars(): + # print(v.name, v.shape) + + exe.run(t_startup) + assert args.teacher_pretrained_model and os.path.exists( + args.teacher_pretrained_model + ), "teacher_pretrained_model should be set when teacher_model is not None." 
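+    # The predicate below loads only the teacher weights that exist on disk and
+    # skips conv1_weights and the fc_0 weights; merge() then copies the teacher
+    # program into the student program, prefixing teacher variables with
+    # "teacher_" so the distillation losses can reference tensors from both nets.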
+ + def if_exist(var): + return os.path.exists( + os.path.join(args.teacher_pretrained_model, var.name) + ) and var.name != 'conv1_weights' and var.name != 'fc_0.w_0' and var.name != 'fc_0.b_0' + + fluid.io.load_vars( + exe, + args.teacher_pretrained_model, + main_program=teacher_program, + predicate=if_exist) + + data_name_map = {'image': 'image'} + main = merge( + teacher_program, + student_program, + data_name_map, + place, + teacher_scope=teacher_scope) + + #print("="*50+"teacher_vars"+"="*50) + #for v in teacher_program.list_vars(): + # if '_generated_var' not in v.name and 'fetch' not in v.name and 'feed' not in v.name: + # print(v.name, v.shape) + #return + + with fluid.program_guard(main, s_startup): + l2_loss_v = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", main) + fsp_loss_v = fsp_loss("teacher_res2a_branch2a.conv2d.output.1.tmp_0", + "teacher_res3a_branch2a.conv2d.output.1.tmp_0", + "depthwise_conv2d_1.tmp_0", "conv2d_3.tmp_0", + main) + loss = avg_cost + l2_loss_v + fsp_loss_v + opt = create_optimizer(args) + opt.minimize(loss) + exe.run(s_startup) + build_strategy = fluid.BuildStrategy() + build_strategy.fuse_all_reduce_ops = False + parallel_main = fluid.CompiledProgram(main).with_data_parallel( + loss_name=loss.name, build_strategy=build_strategy) + + for epoch_id in range(args.num_epochs): + for step_id, data in enumerate(train_loader): + loss_1, loss_2, loss_3, loss_4 = exe.run( + parallel_main, + feed=data, + fetch_list=[ + loss.name, avg_cost.name, l2_loss_v.name, fsp_loss_v.name + ]) + if step_id % args.log_period == 0: + _logger.info( + "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}, fsp loss {:.6f}". + format(epoch_id, step_id, loss_1[0], loss_2[0], loss_3[0], + loss_4[0])) + val_acc1s = [] + val_acc5s = [] + for step_id, data in enumerate(valid_loader): + val_loss, val_acc1, val_acc5 = exe.run( + val_program, + data, + fetch_list=[avg_cost.name, acc_top1.name, acc_top5.name]) + val_acc1s.append(val_acc1) + val_acc5s.append(val_acc5) + if step_id % args.log_period == 0: + _logger.info( + "valid_epoch {} step {} loss {:.6f}, top1 {:.6f}, top5 {:.6f}". 
+ format(epoch_id, step_id, val_loss[0], val_acc1[0], + val_acc5[0])) + _logger.info("epoch {} top1 {:.6f}, top5 {:.6f}".format( + epoch_id, np.mean(val_acc1s), np.mean(val_acc5s))) + + +def main(): + args = parser.parse_args() + print_arguments(args) + compress(args) + + +if __name__ == '__main__': + main() diff --git a/demo/nas/sa_nas_mobilenetv2.py b/demo/nas/sa_nas_mobilenetv2.py new file mode 100644 index 0000000000000000000000000000000000000000..142c2c08f09e7888ab255b1d6ce762a50c8e1966 --- /dev/null +++ b/demo/nas/sa_nas_mobilenetv2.py @@ -0,0 +1,276 @@ +import sys +sys.path.append('..') +import numpy as np +import argparse +import ast +import time +import argparse +import ast +import logging +import paddle +import paddle.fluid as fluid +from paddleslim.nas.search_space.search_space_factory import SearchSpaceFactory +from paddleslim.analysis import flops +from paddleslim.nas import SANAS +from paddleslim.common import get_logger +from optimizer import create_optimizer +import imagenet_reader + +_logger = get_logger(__name__, level=logging.INFO) + + +def create_data_loader(image_shape): + data_shape = [-1] + image_shape + data = fluid.data(name='data', shape=data_shape, dtype='float32') + label = fluid.data(name='label', shape=[-1, 1], dtype='int64') + data_loader = fluid.io.DataLoader.from_generator( + feed_list=[data, label], + capacity=1024, + use_double_buffer=True, + iterable=True) + return data_loader, data, label + + +def build_program(main_program, + startup_program, + image_shape, + archs, + args, + is_test=False): + with fluid.program_guard(main_program, startup_program): + data_loader, data, label = create_data_loader(image_shape) + output = archs(data) + + softmax_out = fluid.layers.softmax(input=output, use_cudnn=False) + cost = fluid.layers.cross_entropy(input=softmax_out, label=label) + avg_cost = fluid.layers.mean(cost) + acc_top1 = fluid.layers.accuracy(input=softmax_out, label=label, k=1) + acc_top5 = fluid.layers.accuracy(input=softmax_out, label=label, k=5) + + if is_test == False: + optimizer = create_optimizer(args) + optimizer.minimize(avg_cost) + return data_loader, avg_cost, acc_top1, acc_top5 + + +def search_mobilenetv2(config, args, image_size, is_server=True): + factory = SearchSpaceFactory() + space = factory.get_search_space(config) + if is_server: + ### start a server and a client + sa_nas = SANAS( + config, + server_addr=("", 8883), + init_temperature=args.init_temperature, + reduce_rate=args.reduce_rate, + search_steps=args.search_steps, + is_server=True) + else: + ### start a client + sa_nas = SANAS( + config, + server_addr=("10.255.125.38", 8883), + init_temperature=args.init_temperature, + reduce_rate=args.reduce_rate, + search_steps=args.search_steps, + is_server=False) + + image_shape = [3, image_size, image_size] + for step in range(args.search_steps): + archs = sa_nas.next_archs()[0] + + train_program = fluid.Program() + test_program = fluid.Program() + startup_program = fluid.Program() + train_loader, avg_cost, acc_top1, acc_top5 = build_program( + train_program, startup_program, image_shape, archs, args) + + current_flops = flops(train_program) + print('step: {}, current_flops: {}'.format(step, current_flops)) + if current_flops > args.max_flops: + continue + + test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program( + test_program, + startup_program, + image_shape, + archs, + args, + is_test=True) + test_program = test_program.clone(for_test=True) + + place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() + exe = 
fluid.Executor(place) + exe.run(startup_program) + + if args.data == 'cifar10': + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10(cycle=False), buf_size=1024), + batch_size=args.batch_size, + drop_last=True) + + test_reader = paddle.batch( + paddle.dataset.cifar.test10(cycle=False), + batch_size=args.batch_size, + drop_last=False) + elif args.data == 'imagenet': + train_reader = paddle.batch( + imagenet_reader.train(), + batch_size=args.batch_size, + drop_last=True) + test_reader = paddle.batch( + imagenet_reader.val(), + batch_size=args.batch_size, + drop_last=False) + + #test_loader, _, _ = create_data_loader(image_shape) + train_loader.set_sample_list_generator( + train_reader, + places=fluid.cuda_places() if args.use_gpu else fluid.cpu_places()) + test_loader.set_sample_list_generator(test_reader, places=place) + + build_strategy = fluid.BuildStrategy() + train_compiled_program = fluid.CompiledProgram( + train_program).with_data_parallel( + loss_name=avg_cost.name, build_strategy=build_strategy) + for epoch_id in range(args.retain_epoch): + for batch_id, data in enumerate(train_loader()): + fetches = [avg_cost.name] + s_time = time.time() + outs = exe.run(train_compiled_program, + feed=data, + fetch_list=fetches)[0] + batch_time = time.time() - s_time + if batch_id % 10 == 0: + _logger.info( + 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'. + format(step, epoch_id, batch_id, outs[0], batch_time)) + + reward = [] + for batch_id, data in enumerate(test_loader()): + test_fetches = [ + test_avg_cost.name, test_acc_top1.name, test_acc_top5.name + ] + batch_reward = exe.run(test_program, + feed=data, + fetch_list=test_fetches) + reward_avg = np.mean(np.array(batch_reward), axis=1) + reward.append(reward_avg) + + _logger.info( + 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'. 
+ format(step, batch_id, batch_reward[0], batch_reward[1], + batch_reward[2])) + + finally_reward = np.mean(np.array(reward), axis=0) + _logger.info( + 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( + finally_reward[0], finally_reward[1], finally_reward[2])) + + sa_nas.reward(float(finally_reward[1])) + + +if __name__ == '__main__': + + parser = argparse.ArgumentParser( + description='SA NAS MobileNetV2 cifar10 argparase') + parser.add_argument( + '--use_gpu', + type=ast.literal_eval, + default=True, + help='Whether to use GPU in train/test model.') + parser.add_argument( + '--batch_size', type=int, default=256, help='batch size.') + parser.add_argument( + '--data', + type=str, + default='cifar10', + choices=['cifar10', 'imagenet'], + help='server address.') + # controller + parser.add_argument( + '--reduce_rate', type=float, default=0.85, help='reduce rate.') + parser.add_argument( + '--init_temperature', + type=float, + default=10.24, + help='init temperature.') + parser.add_argument( + '--is_server', + type=ast.literal_eval, + default=True, + help='Whether to start a server.') + # nas args + parser.add_argument( + '--max_flops', type=int, default=592948064, help='reduce rate.') + parser.add_argument( + '--retain_epoch', type=int, default=5, help='train epoch before val.') + parser.add_argument( + '--end_epoch', type=int, default=500, help='end epoch present client.') + parser.add_argument( + '--search_steps', + type=int, + default=100, + help='controller server number.') + parser.add_argument( + '--server_address', type=str, default=None, help='server address.') + # optimizer args + parser.add_argument( + '--lr_strategy', + type=str, + default='piecewise_decay', + help='learning rate decay strategy.') + parser.add_argument('--lr', type=float, default=0.1, help='learning rate.') + parser.add_argument( + '--l2_decay', type=float, default=1e-4, help='learning rate decay.') + parser.add_argument( + '--step_epochs', + nargs='+', + type=int, + default=[30, 60, 90], + help="piecewise decay step") + parser.add_argument( + '--momentum_rate', + type=float, + default=0.9, + help='learning rate decay.') + parser.add_argument( + '--warm_up_epochs', + type=float, + default=5.0, + help='learning rate decay.') + parser.add_argument( + '--num_epochs', type=int, default=120, help='learning rate decay.') + parser.add_argument( + '--decay_epochs', type=float, default=2.4, help='learning rate decay.') + parser.add_argument( + '--decay_rate', type=float, default=0.97, help='learning rate decay.') + parser.add_argument( + '--total_images', + type=int, + default=1281167, + help='learning rate decay.') + args = parser.parse_args() + print(args) + + if args.data == 'cifar10': + image_size = 32 + block_num = 3 + elif args.data == 'imagenet': + image_size = 224 + block_num = 6 + else: + raise NotImplemented( + 'data must in [cifar10, imagenet], but received: {}'.format( + args.data)) + + config_info = { + 'input_size': image_size, + 'output_size': 1, + 'block_num': block_num, + 'block_mask': None + } + config = [('MobileNetV2Space', config_info)] + + search_mobilenetv2(config, args, image_size, is_server=args.is_server) diff --git a/demo/nas/sa_nas_mobilenetv2_cifar10.py b/demo/nas/sa_nas_mobilenetv2_cifar10.py deleted file mode 100644 index 249d4c214788c0ffc5a0d741dc48b4942ea5808b..0000000000000000000000000000000000000000 --- a/demo/nas/sa_nas_mobilenetv2_cifar10.py +++ /dev/null @@ -1,122 +0,0 @@ -import sys -sys.path.append('..') -import numpy as np -import argparse -import ast -import paddle 
-import paddle.fluid as fluid -from paddleslim.nas.search_space.search_space_factory import SearchSpaceFactory -from paddleslim.analysis import flops -from paddleslim.nas import SANAS - - -def create_data_loader(): - data = fluid.data(name='data', shape=[-1, 3, 32, 32], dtype='float32') - label = fluid.data(name='label', shape=[-1, 1], dtype='int64') - data_loader = fluid.io.DataLoader.from_generator( - feed_list=[data, label], - capacity=1024, - use_double_buffer=True, - iterable=True) - return data_loader, data, label - - -def init_sa_nas(config): - factory = SearchSpaceFactory() - space = factory.get_search_space(config) - model_arch = space.token2arch()[0] - main_program = fluid.Program() - startup_program = fluid.Program() - - with fluid.program_guard(main_program, startup_program): - data_loader, data, label = create_data_loader() - output = model_arch(data) - cost = fluid.layers.mean( - fluid.layers.softmax_with_cross_entropy( - logits=output, label=label)) - - base_flops = flops(main_program) - search_steps = 10000000 - - ### start a server and a client - sa_nas = SANAS(config, search_steps=search_steps, is_server=True) - - ### start a client, server_addr is server address - #sa_nas = SANAS(config, max_flops = base_flops, server_addr=("10.255.125.38", 18607), search_steps = search_steps, is_server=False) - - return sa_nas, search_steps - - -def search_mobilenetv2_cifar10(config, args): - sa_nas, search_steps = init_sa_nas(config) - for i in range(search_steps): - print('search step: ', i) - archs = sa_nas.next_archs()[0] - train_program = fluid.Program() - test_program = fluid.Program() - startup_program = fluid.Program() - with fluid.program_guard(train_program, startup_program): - train_loader, data, label = create_data_loader() - output = archs(data) - cost = fluid.layers.mean( - fluid.layers.softmax_with_cross_entropy( - logits=output, label=label))[0] - test_program = train_program.clone(for_test=True) - - optimizer = fluid.optimizer.Momentum( - learning_rate=0.1, - momentum=0.9, - regularization=fluid.regularizer.L2Decay(1e-4)) - optimizer.minimize(cost) - - place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(startup_program) - train_reader = paddle.reader.shuffle( - paddle.dataset.cifar.train10(cycle=False), buf_size=1024) - train_loader.set_sample_generator( - train_reader, - batch_size=512, - places=fluid.cuda_places() if args.use_gpu else fluid.cpu_places()) - - test_loader, _, _ = create_data_loader() - test_reader = paddle.dataset.cifar.test10(cycle=False) - test_loader.set_sample_generator( - test_reader, - batch_size=256, - drop_last=False, - places=fluid.cuda_places() if args.use_gpu else fluid.cpu_places()) - - for epoch_id in range(10): - for batch_id, data in enumerate(train_loader()): - loss = exe.run(train_program, - feed=data, - fetch_list=[cost.name])[0] - if batch_id % 5 == 0: - print('epoch: {}, batch: {}, loss: {}'.format( - epoch_id, batch_id, loss[0])) - - for data in test_loader(): - reward = exe.run(test_program, feed=data, - fetch_list=[cost.name])[0] - - print('reward:', reward) - sa_nas.reward(float(reward)) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='SA NAS MobileNetV2 cifar10 argparase') - parser.add_argument( - '--use_gpu', - type=ast.literal_eval, - default=True, - help='Whether to use GPU in train/test model.') - args = parser.parse_args() - print(args) - - config_info = {'input_size': 32, 'output_size': 1, 'block_num': 5} - config = 
[('MobileNetV2Space', config_info)] - - search_mobilenetv2_cifar10(config, args) diff --git a/demo/optimizer.py b/demo/optimizer.py new file mode 100644 index 0000000000000000000000000000000000000000..73f441f897d22c10d2d6e05afaa7491b227b27d4 --- /dev/null +++ b/demo/optimizer.py @@ -0,0 +1,300 @@ +#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import math + +import paddle.fluid as fluid +import paddle.fluid.layers.ops as ops +from paddle.fluid.initializer import init_on_cpu +from paddle.fluid.layers.learning_rate_scheduler import _decay_step_counter + + +def cosine_decay(learning_rate, step_each_epoch, epochs=120): + """Applies cosine decay to the learning rate. + lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + """ + global_step = _decay_step_counter() + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + decayed_lr = learning_rate * \ + (ops.cos(epoch * (math.pi / epochs)) + 1)/2 + return decayed_lr + + +def cosine_decay_with_warmup(learning_rate, step_each_epoch, epochs=120): + """Applies cosine decay to the learning rate. + lr = 0.05 * (math.cos(epoch * (math.pi / 120)) + 1) + decrease lr for every mini-batch and start with warmup. + """ + global_step = _decay_step_counter() + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + + warmup_epoch = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=float(5), force_cpu=True) + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + with fluid.layers.control_flow.Switch() as switch: + with switch.case(epoch < warmup_epoch): + decayed_lr = learning_rate * (global_step / + (step_each_epoch * warmup_epoch)) + fluid.layers.tensor.assign(input=decayed_lr, output=lr) + with switch.default(): + decayed_lr = learning_rate * \ + (ops.cos((global_step - warmup_epoch * step_each_epoch) * (math.pi / (epochs * step_each_epoch))) + 1)/2 + fluid.layers.tensor.assign(input=decayed_lr, output=lr) + return lr + + +def exponential_decay_with_warmup(learning_rate, + step_each_epoch, + decay_epochs, + decay_rate=0.97, + warm_up_epoch=5.0): + """Applies exponential decay to the learning rate. 
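+    With linear warmup for the first warm_up_epoch epochs:
+        lr = learning_rate * global_step / (step_each_epoch * warm_up_epoch)
+    afterwards:
+        lr = learning_rate * decay_rate ^ floor((global_step - warm_up_epoch * step_each_epoch) / decay_epochs)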
+ """ + global_step = _decay_step_counter() + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate") + + warmup_epoch = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=float(warm_up_epoch), force_cpu=True) + + with init_on_cpu(): + epoch = ops.floor(global_step / step_each_epoch) + with fluid.layers.control_flow.Switch() as switch: + with switch.case(epoch < warmup_epoch): + decayed_lr = learning_rate * (global_step / + (step_each_epoch * warmup_epoch)) + fluid.layers.assign(input=decayed_lr, output=lr) + with switch.default(): + div_res = (global_step - warmup_epoch * step_each_epoch + ) / decay_epochs + div_res = ops.floor(div_res) + decayed_lr = learning_rate * (decay_rate**div_res) + fluid.layers.assign(input=decayed_lr, output=lr) + + return lr + + +def lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): + """ Applies linear learning rate warmup for distributed training + Argument learning_rate can be float or a Variable + lr = lr + (warmup_rate * step / warmup_steps) + """ + assert (isinstance(end_lr, float)) + assert (isinstance(start_lr, float)) + linear_step = end_lr - start_lr + with fluid.default_main_program()._lr_schedule_guard(): + lr = fluid.layers.tensor.create_global_var( + shape=[1], + value=0.0, + dtype='float32', + persistable=True, + name="learning_rate_warmup") + + global_step = fluid.layers.learning_rate_scheduler._decay_step_counter( + ) + + with fluid.layers.control_flow.Switch() as switch: + with switch.case(global_step < warmup_steps): + decayed_lr = start_lr + linear_step * (global_step / + warmup_steps) + fluid.layers.tensor.assign(decayed_lr, lr) + with switch.default(): + fluid.layers.tensor.assign(learning_rate, lr) + + return lr + + +class Optimizer(object): + """A class used to represent several optimizer methods + + Attributes: + batch_size: batch size on all devices. + lr: learning rate. + lr_strategy: learning rate decay strategy. + l2_decay: l2_decay parameter. + momentum_rate: momentum rate when using Momentum optimizer. + step_epochs: piecewise decay steps. + num_epochs: number of total epochs. + + total_images: total images. + step: total steps in the an epoch. 
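+
+        Typical usage (see create_optimizer at the bottom of this file): the
+        method named by args.lr_strategy is looked up and called, e.g.
+        getattr(Optimizer(args), "piecewise_decay")() returns a Momentum
+        optimizer with piecewise learning-rate decay.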
+ + """ + + def __init__(self, args): + self.batch_size = args.batch_size + self.lr = args.lr + self.lr_strategy = args.lr_strategy + self.l2_decay = args.l2_decay + self.momentum_rate = args.momentum_rate + self.step_epochs = args.step_epochs + self.num_epochs = args.num_epochs + self.warm_up_epochs = args.warm_up_epochs + self.decay_epochs = args.decay_epochs + self.decay_rate = args.decay_rate + self.total_images = args.total_images + + self.step = int(math.ceil(float(self.total_images) / self.batch_size)) + + def piecewise_decay(self): + """piecewise decay with Momentum optimizer + + Returns: + a piecewise_decay optimizer + """ + bd = [self.step * e for e in self.step_epochs] + lr = [self.lr * (0.1**i) for i in range(len(bd) + 1)] + learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=self.momentum_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay)) + return optimizer + + def cosine_decay(self): + """cosine decay with Momentum optimizer + + Returns: + a cosine_decay optimizer + """ + + learning_rate = fluid.layers.cosine_decay( + learning_rate=self.lr, + step_each_epoch=self.step, + epochs=self.num_epochs) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=self.momentum_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay)) + return optimizer + + def cosine_decay_warmup(self): + """cosine decay with warmup + + Returns: + a cosine_decay_with_warmup optimizer + """ + + learning_rate = cosine_decay_with_warmup( + learning_rate=self.lr, + step_each_epoch=self.step, + epochs=self.num_epochs) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=self.momentum_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay)) + return optimizer + + def exponential_decay_warmup(self): + """exponential decay with warmup + + Returns: + a exponential_decay_with_warmup optimizer + """ + + learning_rate = exponential_decay_with_warmup( + learning_rate=self.lr, + step_each_epoch=self.step, + decay_epochs=self.step * self.decay_epochs, + decay_rate=self.decay_rate, + warm_up_epoch=self.warm_up_epochs) + optimizer = fluid.optimizer.RMSProp( + learning_rate=learning_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay), + momentum=self.momentum_rate, + rho=0.9, + epsilon=0.001) + return optimizer + + def linear_decay(self): + """linear decay with Momentum optimizer + + Returns: + a linear_decay optimizer + """ + + end_lr = 0 + learning_rate = fluid.layers.polynomial_decay( + self.lr, self.step, end_lr, power=1) + optimizer = fluid.optimizer.Momentum( + learning_rate=learning_rate, + momentum=self.momentum_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay)) + + return optimizer + + def adam_decay(self): + """Adam optimizer + + Returns: + an adam_decay optimizer + """ + + return fluid.optimizer.Adam(learning_rate=self.lr) + + def cosine_decay_RMSProp(self): + """cosine decay with RMSProp optimizer + + Returns: + an cosine_decay_RMSProp optimizer + """ + + learning_rate = fluid.layers.cosine_decay( + learning_rate=self.lr, + step_each_epoch=self.step, + epochs=self.num_epochs) + optimizer = fluid.optimizer.RMSProp( + learning_rate=learning_rate, + momentum=self.momentum_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay), + # Apply epsilon=1 on ImageNet dataset. 
+ epsilon=1) + return optimizer + + def default_decay(self): + """default decay + + Returns: + default decay optimizer + """ + + optimizer = fluid.optimizer.Momentum( + learning_rate=self.lr, + momentum=self.momentum_rate, + regularization=fluid.regularizer.L2Decay(self.l2_decay)) + return optimizer + + +def create_optimizer(args): + Opt = Optimizer(args) + optimizer = getattr(Opt, args.lr_strategy)() + + return optimizer diff --git a/paddleslim/common/controller_server.py b/paddleslim/common/controller_server.py index bf3ee3ab2e27c468c929013be6954f4042e53537..eb37fe914e99ced520b3ab8faf4227466e83ae3c 100644 --- a/paddleslim/common/controller_server.py +++ b/paddleslim/common/controller_server.py @@ -107,6 +107,8 @@ class ControllerServer(object): _logger.debug("send message to {}: [{}]".format(addr, tokens)) conn.close() + except Exception, err: + _logger.error(err) finally: self._socket_server.close() self.close() diff --git a/paddleslim/common/sa_controller.py b/paddleslim/common/sa_controller.py index 9a36da93c848821ac8b9d8992b4b4d5d6bf44994..f3dfa4b9c0101f068be7c925cdb0f738f7b00d4c 100644 --- a/paddleslim/common/sa_controller.py +++ b/paddleslim/common/sa_controller.py @@ -75,7 +75,7 @@ class SAController(EvolutionaryController): iter = int(iter) if iter > self._iter: self._iter = iter - temperature = self._init_temperature * self._reduce_rate**self._iter + temperature = self._init_temperature * self._reduce_rate**self._iter if (reward > self._reward) or (np.random.random() <= math.exp( (reward - self._reward) / temperature)): self._reward = reward @@ -98,7 +98,7 @@ class SAController(EvolutionaryController): new_tokens = tokens[:] index = int(len(self._range_table[0]) * np.random.random()) new_tokens[index] = np.random.randint(self._range_table[0][index], - self._range_table[1][index] + 1) + self._range_table[1][index]) _logger.debug("change index[{}] from {} to {}".format(index, tokens[ index], new_tokens[index])) if self._constrain_func is None or self._max_try_times is None: diff --git a/paddleslim/core/graph_wrapper.py b/paddleslim/core/graph_wrapper.py index 72de894a2e4345c32e7a4eee2f35249b77c2f467..dc01846a10feb8bf212f9e35b9cd585df47ba739 100644 --- a/paddleslim/core/graph_wrapper.py +++ b/paddleslim/core/graph_wrapper.py @@ -54,6 +54,9 @@ class VarWrapper(object): """ return self._var.name + def __repr__(self): + return self._var.name + def shape(self): """ Get the shape of the varibale. @@ -131,6 +134,11 @@ class OpWrapper(object): """ return self._op.type + def __repr__(self): + return "op[id: {}, type: {}; inputs: {}]".format(self.idx(), + self.type(), + self.all_inputs()) + def is_bwd_op(self): """ Whether this operator is backward op. 
diff --git a/paddleslim/nas/sa_nas.py b/paddleslim/nas/sa_nas.py index f57caaa6beb6fec59b618a689b44652f0cf259fc..00decbfd1ae38dfa3fedf3234665ca740674d603 100644 --- a/paddleslim/nas/sa_nas.py +++ b/paddleslim/nas/sa_nas.py @@ -60,16 +60,17 @@ class SANAS(object): self._init_temperature = init_temperature self._is_server = is_server self._configs = configs - self._keys = hashlib.md5(str(self._configs)).hexdigest() + self._key = hashlib.md5(str(self._configs)).hexdigest() server_ip, server_port = server_addr if server_ip == None or server_ip == "": server_ip = self._get_host_ip() + factory = SearchSpaceFactory() + self._search_space = factory.get_search_space(configs) + # create controller server if self._is_server: - factory = SearchSpaceFactory() - self._search_space = factory.get_search_space(configs) init_tokens = self._search_space.init_tokens() range_table = self._search_space.range_table() range_table = (len(range_table) * [0], range_table) @@ -90,6 +91,7 @@ class SANAS(object): search_steps=search_steps, key=self._key) self._controller_server.start() + server_port = self._controller_server.port() self._controller_client = ControllerClient( server_ip, server_port, key=self._key) @@ -99,6 +101,9 @@ class SANAS(object): def _get_host_ip(self): return socket.gethostbyname(socket.gethostname()) + def tokens2arch(self, tokens): + return self._search_space.token2arch(self.tokens) + def next_archs(self): """ Get next network architectures. diff --git a/paddleslim/nas/search_space/combine_search_space.py b/paddleslim/nas/search_space/combine_search_space.py index 667720a9110aa92e096a4f8fa30bb3e4b3e3cecb..17ebbd3939798ad0e2a7d3fd763bb9427f6e13f0 100644 --- a/paddleslim/nas/search_space/combine_search_space.py +++ b/paddleslim/nas/search_space/combine_search_space.py @@ -39,6 +39,7 @@ class CombineSearchSpace(object): for config_list in config_lists: key, config = config_list self.spaces.append(self._get_single_search_space(key, config)) + self.init_tokens() def _get_single_search_space(self, key, config): """ @@ -51,9 +52,11 @@ class CombineSearchSpace(object): model space(class) """ cls = SEARCHSPACE.get(key) - space = cls(config['input_size'], config['output_size'], - config['block_num'], config['block_mask']) - + block_mask = config['block_mask'] if 'block_mask' in config else None + space = cls(config['input_size'], + config['output_size'], + config['block_num'], + block_mask=block_mask) return space def init_tokens(self): diff --git a/paddleslim/nas/search_space/mobilenetv1.py b/paddleslim/nas/search_space/mobilenetv1.py index 8b3277d2cb1b472ccd5e27407e3099b28e64f42b..3976d21df1e3ad2c5ac344dab59ad32adeaedb79 100644 --- a/paddleslim/nas/search_space/mobilenetv1.py +++ b/paddleslim/nas/search_space/mobilenetv1.py @@ -32,10 +32,12 @@ class MobileNetV1Space(SearchSpaceBase): input_size, output_size, block_num, + block_mask, scale=1.0, class_dim=1000): super(MobileNetV1Space, self).__init__(input_size, output_size, - block_num) + block_num, block_mask) + assert self.block_mask == None, 'MobileNetV1Space will use origin MobileNetV1 as seach space, so use input_size, output_size and block_num to search' self.scale = scale self.class_dim = class_dim # self.head_num means the channel of first convolution diff --git a/paddleslim/nas/search_space/mobilenetv2.py b/paddleslim/nas/search_space/mobilenetv2.py index e974a676a70546e19aa4649679393031634e7822..36231912715a29808d55158881ab3e918260f8b5 100644 --- a/paddleslim/nas/search_space/mobilenetv2.py +++ b/paddleslim/nas/search_space/mobilenetv2.py @@ 
-113,40 +113,69 @@ class MobileNetV2Space(SearchSpaceBase): if tokens is None: tokens = self.init_tokens() - print(tokens) - bottleneck_params_list = [] + self.bottleneck_params_list = [] if self.block_num >= 1: - bottleneck_params_list.append( + self.bottleneck_params_list.append( (1, self.head_num[tokens[0]], 1, 1, 3)) if self.block_num >= 2: - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[1]], self.filter_num1[tokens[2]], self.repeat[tokens[3]], 2, self.k_size[tokens[4]])) if self.block_num >= 3: - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[5]], self.filter_num1[tokens[6]], self.repeat[tokens[7]], 2, self.k_size[tokens[8]])) if self.block_num >= 4: - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[9]], self.filter_num2[tokens[10]], self.repeat[tokens[11]], 2, self.k_size[tokens[12]])) if self.block_num >= 5: - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[13]], self.filter_num3[tokens[14]], self.repeat[tokens[15]], 2, self.k_size[tokens[16]])) - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[17]], self.filter_num4[tokens[18]], self.repeat[tokens[19]], 1, self.k_size[tokens[20]])) if self.block_num >= 6: - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[21]], self.filter_num5[tokens[22]], self.repeat[tokens[23]], 2, self.k_size[tokens[24]])) - bottleneck_params_list.append( + self.bottleneck_params_list.append( (self.multiply[tokens[25]], self.filter_num6[tokens[26]], self.repeat[tokens[27]], 1, self.k_size[tokens[28]])) - def net_arch(input): + def _modify_bottle_params(output_stride=None): + if output_stride is not None and output_stride % 2 != 0: + raise Exception("output stride must to be even number") + if output_stride is None: + return + else: + stride = 2 + for i, layer_setting in enumerate(self.bottleneck_params_list): + t, c, n, s, ks = layer_setting + stride = stride * s + if stride > output_stride: + s = 1 + self.bottleneck_params_list[i] = (t, c, n, s, ks) + + def net_arch(input, + end_points=None, + decode_points=None, + output_stride=None): + _modify_bottle_params(output_stride) + + decode_ends = dict() + + def check_points(count, points): + if points is None: + return False + else: + if isinstance(points, list): + return (True if count in points else False) + else: + return (True if count == points else False) + #conv1 # all padding is 'SAME' in the conv2d, can compute the actual padding automatic. 
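+            # end_points / decode_points name block indices whose outputs should be
+            # captured: decode_points stores that block's output in decode_ends, and
+            # reaching end_points returns early with (input, decode_ends).
+            # output_stride (if set) caps the cumulative stride via _modify_bottle_params,
+            # which forces the stride of later bottleneck stages to 1.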
input = conv_bn_layer( @@ -157,14 +186,21 @@ class MobileNetV2Space(SearchSpaceBase): padding='SAME', act='relu6', name='mobilenetv2_conv1_1') + layer_count = 1 + if check_points(layer_count, decode_points): + decode_ends[layer_count] = input + + if check_points(layer_count, end_points): + return input, decode_ends # bottleneck sequences i = 1 in_c = int(32 * self.scale) - for layer_setting in bottleneck_params_list: + for layer_setting in self.bottleneck_params_list: t, c, n, s, k = layer_setting i += 1 - input = self._invresi_blocks( + #print(input) + input, depthwise_output = self._invresi_blocks( input=input, in_c=in_c, t=t, @@ -174,6 +210,33 @@ class MobileNetV2Space(SearchSpaceBase): k=k, name='mobilenetv2_conv' + str(i)) in_c = int(c * self.scale) + layer_count += 1 + + ### decode_points and end_points means block num + if check_points(layer_count, decode_points): + decode_ends[layer_count] = depthwise_output + + if check_points(layer_count, end_points): + return input, decode_ends + + # last conv + input = conv_bn_layer( + input=input, + num_filters=int(1280 * self.scale) + if self.scale > 1.0 else 1280, + filter_size=1, + stride=1, + padding='SAME', + act='relu6', + name='mobilenetv2_conv' + str(i + 1)) + + input = fluid.layers.pool2d( + input=input, + pool_size=7, + pool_stride=1, + pool_type='avg', + global_pooling=True, + name='mobilenetv2_last_pool') # if output_size is 1, add fc layer in the end if self.output_size == 1: @@ -248,6 +311,8 @@ class MobileNetV2Space(SearchSpaceBase): name=name + '_dwise', use_cudnn=False) + depthwise_output = bottleneck_conv + linear_out = conv_bn_layer( input=bottleneck_conv, num_filters=num_filters, @@ -260,7 +325,7 @@ class MobileNetV2Space(SearchSpaceBase): out = linear_out if ifshortcut: out = self._shortcut(input=input, data_residual=out) - return out + return out, depthwise_output def _invresi_blocks(self, input, in_c, t, c, n, s, k, name=None): """Build inverted residual blocks. @@ -276,7 +341,7 @@ class MobileNetV2Space(SearchSpaceBase): Returns: Variable, layers output. """ - first_block = self._inverted_residual_unit( + first_block, depthwise_output = self._inverted_residual_unit( input=input, num_in_filter=in_c, num_filters=c, @@ -290,7 +355,7 @@ class MobileNetV2Space(SearchSpaceBase): last_c = c for i in range(1, n): - last_residual_block = self._inverted_residual_unit( + last_residual_block, depthwise_output = self._inverted_residual_unit( input=last_residual_block, num_in_filter=last_c, num_filters=c, @@ -299,4 +364,4 @@ class MobileNetV2Space(SearchSpaceBase): filter_size=k, expansion_factor=t, name=name + '_' + str(i + 1)) - return last_residual_block + return last_residual_block, depthwise_output diff --git a/paddleslim/nas/search_space/search_space_base.py b/paddleslim/nas/search_space/search_space_base.py index 6a83f86005a5fb2408f7f85f40dff8a9e5cba819..b8f5d9b89bd2a64e566a5b20280dd27048a4028b 100644 --- a/paddleslim/nas/search_space/search_space_base.py +++ b/paddleslim/nas/search_space/search_space_base.py @@ -19,11 +19,19 @@ class SearchSpaceBase(object): """Controller for Neural Architecture Search. """ - def __init__(self, input_size, output_size, block_num, block_mask, *argss): + def __init__(self, input_size, output_size, block_num, block_mask, *args): + """init model config + """ self.input_size = input_size self.output_size = output_size self.block_num = block_num self.block_mask = block_mask + if self.block_mask is not None: + assert isinstance(self.block_mask, + list), 'Block_mask must be a list.' 
+ print( + "If block_mask is NOT None, we will use block_mask as major configs!" + ) def init_tokens(self): """Get init tokens in search space. diff --git a/paddleslim/prune/pruner.py b/paddleslim/prune/pruner.py index 0fdde525a793b90df63f3245ac5215365dd7ccf4..e2b6a7e1d28078abef97c5fa53b215b098f18cca 100644 --- a/paddleslim/prune/pruner.py +++ b/paddleslim/prune/pruner.py @@ -528,33 +528,41 @@ class Pruner(): Returns: list: A list of operators. """ + _logger.debug("######################search: {}######################". + format(op_node)) visited = [op_node.idx()] stack = [] brothers = [] for op in graph.next_ops(op_node): - if (op.type() != 'conv2d') and (op.type() != 'fc') and ( - not op.is_bwd_op()): + if ("conv2d" not in op.type()) and (op.type() != 'fc') and ( + not op.is_bwd_op()) and (not op.is_opt_op()): stack.append(op) visited.append(op.idx()) while len(stack) > 0: top_op = stack.pop() - if top_op.type().startswith("elementwise_"): - for parent in graph.pre_ops(top_op): - if parent.idx() not in visited and ( - not parent.is_bwd_op()): - if ((parent.type() == 'conv2d') or - (parent.type() == 'fc')): - brothers.append(parent) - else: - stack.append(parent) - visited.append(parent.idx()) + for parent in graph.pre_ops(top_op): + if parent.idx() not in visited and ( + not parent.is_bwd_op()) and (not parent.is_opt_op()): + _logger.debug("----------go back from {} to {}----------". + format(top_op, parent)) + if (('conv2d' in parent.type()) or + (parent.type() == 'fc')): + brothers.append(parent) + else: + stack.append(parent) + visited.append(parent.idx()) for child in graph.next_ops(top_op): - if (child.type() != 'conv2d') and (child.type() != 'fc') and ( + if ('conv2d' not in child.type() + ) and (child.type() != 'fc') and ( child.idx() not in visited) and ( - not child.is_bwd_op()): + not child.is_bwd_op()) and (not child.is_opt_op()): stack.append(child) visited.append(child.idx()) + _logger.debug("brothers: {}".format(brothers)) + _logger.debug( + "######################Finish search######################".format( + op_node)) return brothers def _cal_pruned_idx(self, name, param, ratio, axis): diff --git a/tests/test_prune.py b/tests/test_prune.py index 93609367351618ce375f164a1dca284e85369e4c..3fdaa867e350af876648871f83fe70cc83b548b6 100644 --- a/tests/test_prune.py +++ b/tests/test_prune.py @@ -15,7 +15,7 @@ import sys sys.path.append("../") import unittest import paddle.fluid as fluid -from prune import Pruner +from paddleslim.prune import Pruner from layers import conv_bn_layer diff --git a/tests/test_sa_nas.py b/tests/test_sa_nas.py index 5666e1410a820c09bc10fa0b10d282434c7837fe..a4203a85a898632ac2102eb61ab7dd7b475e73ef 100644 --- a/tests/test_sa_nas.py +++ b/tests/test_sa_nas.py @@ -40,7 +40,11 @@ class TestSANAS(unittest.TestCase): base_flops = flops(main_program) search_steps = 3 - sa_nas = SANAS(configs, search_steps=search_steps, is_server=True) + sa_nas = SANAS( + configs, + search_steps=search_steps, + server_addr=("", 0), + is_server=True) for i in range(search_steps): archs = sa_nas.next_archs()
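A minimal sketch (not part of the patch) of the search loop that the updated test and the new demo/nas/sa_nas_mobilenetv2.py both follow; the config mirrors the cifar10 settings above, and the reward is a random placeholder standing in for a real evaluation:

    import numpy as np
    from paddleslim.nas import SANAS

    config = [('MobileNetV2Space',
               {'input_size': 32, 'output_size': 1, 'block_num': 3, 'block_mask': None})]
    # server_addr=("", 0) lets the controller pick a free port, as in tests/test_sa_nas.py
    sa_nas = SANAS(config, server_addr=("", 0), search_steps=3, is_server=True)
    for _ in range(3):
        archs = sa_nas.next_archs()[0]     # callable that builds the candidate network
        # build a program with archs(data), train for a few epochs, evaluate it ...
        score = float(np.random.random())  # placeholder metric, illustration only
        sa_nas.reward(score)               # feed the metric back to the SA controller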