diff --git a/demo/nas/sa_nas_mobilenetv2.py b/demo/nas/sa_nas_mobilenetv2.py index 6eb557d16ded2f09f21c5580ba6e9756311691be..ce918f77d6e13d098fc13ae7cb574a2f51b766e8 100644 --- a/demo/nas/sa_nas_mobilenetv2.py +++ b/demo/nas/sa_nas_mobilenetv2.py @@ -1,315 +1,1028 @@ -import sys -sys.path.append('..') -import numpy as np -import argparse -import ast -import time -import argparse -import ast -import logging -import paddle -import paddle.nn as nn -import paddle.static as static -import paddle.nn.functional as F -import paddle.vision.transforms as T -from paddle import ParamAttr -from paddleslim.analysis import flops -from paddleslim.nas import SANAS -from paddleslim.common import get_logger -from optimizer import create_optimizer -import imagenet_reader - -_logger = get_logger(__name__, level=logging.INFO) - - -def build_program(main_program, - startup_program, - image_shape, - dataset, - archs, - args, - places, - is_test=False): - with static.program_guard(main_program, startup_program): - with paddle.utils.unique_name.guard(): - data_shape = [None] + image_shape - data = static.data(name='data', shape=data_shape, dtype='float32') - label = static.data(name='label', shape=[None, 1], dtype='int64') - if args.data == 'cifar10': - paddle.assign(paddle.reshape(label, [-1, 1]), label) - if is_test: - data_loader = paddle.io.DataLoader( - dataset, - places=places, - feed_list=[data, label], - drop_last=False, - batch_size=args.batch_size, - return_list=False, - shuffle=False) - else: - data_loader = paddle.io.DataLoader( - dataset, - places=places, - feed_list=[data, label], - drop_last=True, - batch_size=args.batch_size, - return_list=False, - shuffle=True, - use_shared_memory=True, - num_workers=4) - output = archs(data) - output = static.nn.fc(x=output, size=args.class_dim) - - softmax_out = F.softmax(output) - cost = F.cross_entropy(softmax_out, label=label) - avg_cost = paddle.mean(cost) - acc_top1 = paddle.metric.accuracy( - input=softmax_out, label=label, k=1) - acc_top5 = paddle.metric.accuracy( - input=softmax_out, label=label, k=5) - - if is_test == False: - optimizer = create_optimizer(args) - optimizer.minimize(avg_cost) - return data_loader, avg_cost, acc_top1, acc_top5 - - -def search_mobilenetv2(config, args, image_size, is_server=True): - image_shape = [3, image_size, image_size] - if args.data == 'cifar10': - transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) - train_dataset = paddle.vision.datasets.Cifar10( - mode='train', transform=transform, backend='cv2') - val_dataset = paddle.vision.datasets.Cifar10( - mode='test', transform=transform, backend='cv2') - - elif args.data == 'imagenet': - train_dataset = imagenet_reader.ImageNetDataset(mode='train') - val_dataset = imagenet_reader.ImageNetDataset(mode='val') - - places = static.cuda_places() if args.use_gpu else static.cpu_places() - place = places[0] - if is_server: - ### start a server and a client - sa_nas = SANAS( - config, - server_addr=(args.server_address, args.port), - search_steps=args.search_steps, - is_server=True) - else: - ### start a client - sa_nas = SANAS( - config, - server_addr=(args.server_address, args.port), - search_steps=args.search_steps, - is_server=False) - - for step in range(args.search_steps): - archs = sa_nas.next_archs()[0] - - train_program = static.Program() - test_program = static.Program() - startup_program = static.Program() - train_loader, avg_cost, acc_top1, acc_top5 = build_program( - train_program, startup_program, image_shape, train_dataset, archs, - args, places) - 
- current_flops = flops(train_program) - print('step: {}, current_flops: {}'.format(step, current_flops)) - if current_flops > int(321208544): - continue - - test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program( - test_program, - startup_program, - image_shape, - val_dataset, - archs, - args, - place, - is_test=True) - test_program = test_program.clone(for_test=True) - - exe = static.Executor(place) - exe.run(startup_program) - - build_strategy = static.BuildStrategy() - train_compiled_program = static.CompiledProgram( - train_program).with_data_parallel( - loss_name=avg_cost.name, build_strategy=build_strategy) - for epoch_id in range(args.retain_epoch): - for batch_id, data in enumerate(train_loader()): - fetches = [avg_cost.name] - s_time = time.time() - outs = exe.run(train_compiled_program, - feed=data, - fetch_list=fetches)[0] - batch_time = time.time() - s_time - if batch_id % 10 == 0: - _logger.info( - 'TRAIN: steps: {}, epoch: {}, batch: {}, cost: {}, batch_time: {}ms'. - format(step, epoch_id, batch_id, outs[0], batch_time)) - - reward = [] - for batch_id, data in enumerate(test_loader()): - test_fetches = [ - test_avg_cost.name, test_acc_top1.name, test_acc_top5.name - ] - batch_reward = exe.run(test_program, - feed=data, - fetch_list=test_fetches) - reward_avg = np.mean(np.array(batch_reward), axis=1) - reward.append(reward_avg) - - _logger.info( - 'TEST: step: {}, batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'. - format(step, batch_id, batch_reward[0], batch_reward[1], - batch_reward[2])) - - finally_reward = np.mean(np.array(reward), axis=0) - _logger.info( - 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( - finally_reward[0], finally_reward[1], finally_reward[2])) - - sa_nas.reward(float(finally_reward[1])) - - -def test_search_result(tokens, image_size, args, config): - places = static.cuda_places() if args.use_gpu else static.cpu_places() - place = places[0] - - sa_nas = SANAS( - config, - server_addr=(args.server_address, args.port), - search_steps=args.search_steps, - is_server=True) - - image_shape = [3, image_size, image_size] - if args.data == 'cifar10': - transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) - train_dataset = paddle.vision.datasets.Cifar10( - mode='train', transform=transform, backend='cv2') - val_dataset = paddle.vision.datasets.Cifar10( - mode='test', transform=transform, backend='cv2') - - elif args.data == 'imagenet': - train_dataset = imagenet_reader.ImageNetDataset(mode='train') - val_dataset = imagenet_reader.ImageNetDataset(mode='val') - - archs = sa_nas.tokens2arch(tokens)[0] - - train_program = static.Program() - test_program = static.Program() - startup_program = static.Program() - train_loader, avg_cost, acc_top1, acc_top5 = build_program( - train_program, startup_program, image_shape, train_dataset, archs, args, - places) - - current_flops = flops(train_program) - print('current_flops: {}'.format(current_flops)) - test_loader, test_avg_cost, test_acc_top1, test_acc_top5 = build_program( - test_program, - startup_program, - image_shape, - val_dataset, - archs, - args, - place, - is_test=True) - - test_program = test_program.clone(for_test=True) - - exe = static.Executor(place) - exe.run(startup_program) - - build_strategy = static.BuildStrategy() - train_compiled_program = static.CompiledProgram( - train_program).with_data_parallel( - loss_name=avg_cost.name, build_strategy=build_strategy) - for epoch_id in range(args.retain_epoch): - for batch_id, data in enumerate(train_loader()): 
- fetches = [avg_cost.name] - s_time = time.time() - outs = exe.run(train_compiled_program, - feed=data, - fetch_list=fetches)[0] - batch_time = time.time() - s_time - if batch_id % 10 == 0: - _logger.info( - 'TRAIN: epoch: {}, batch: {}, cost: {}, batch_time: {}ms'. - format(epoch_id, batch_id, outs[0], batch_time)) - - reward = [] - for batch_id, data in enumerate(test_loader()): - test_fetches = [ - test_avg_cost.name, test_acc_top1.name, test_acc_top5.name - ] - batch_reward = exe.run(test_program, - feed=data, - fetch_list=test_fetches) - reward_avg = np.mean(np.array(batch_reward), axis=1) - reward.append(reward_avg) - - _logger.info( - 'TEST: batch: {}, avg_cost: {}, acc_top1: {}, acc_top5: {}'. - format(batch_id, batch_reward[0], batch_reward[1], batch_reward[ - 2])) - - finally_reward = np.mean(np.array(reward), axis=0) - _logger.info( - 'FINAL TEST: avg_cost: {}, acc_top1: {}, acc_top5: {}'.format( - finally_reward[0], finally_reward[1], finally_reward[2])) - - -if __name__ == '__main__': - - parser = argparse.ArgumentParser( - description='SA NAS MobileNetV2 cifar10 argparase') - parser.add_argument( - '--use_gpu', - type=ast.literal_eval, - default=True, - help='Whether to use GPU in train/test model.') - parser.add_argument( - '--batch_size', type=int, default=256, help='batch size.') - parser.add_argument( - '--class_dim', type=int, default=10, help='classify number.') - parser.add_argument( - '--data', - type=str, - default='cifar10', - choices=['cifar10', 'imagenet'], - help='server address.') - parser.add_argument( - '--is_server', - type=ast.literal_eval, - default=True, - help='Whether to start a server.') - parser.add_argument( - '--search_steps', - type=int, - default=100, - help='controller server number.') - parser.add_argument( - '--server_address', type=str, default="", help='server ip.') - parser.add_argument('--port', type=int, default=8881, help='server port') - parser.add_argument( - '--retain_epoch', type=int, default=5, help='epoch for each token.') - parser.add_argument('--lr', type=float, default=0.1, help='learning rate.') - args = parser.parse_args() - print(args) - - if args.data == 'cifar10': - image_size = 32 - block_num = 3 - elif args.data == 'imagenet': - image_size = 224 - block_num = 6 - else: - raise NotImplementedError( - 'data must in [cifar10, imagenet], but received: {}'.format( - args.data)) - - config = [('MobileNetV2Space')] - paddle.enable_static() - search_mobilenetv2(config, args, image_size, is_server=args.is_server) +#!/usr/bin/env bash +################## +#bash slim_ci_demo_all_case.sh $5 $6; + +print_info(){ +if [ $1 -ne 0 ];then + mv ${log_path}/$2 ${log_path}/FAIL_$2.log + echo -e "\033[31m ${log_path}/FAIL_$2 \033[0m" + echo "fail log as follow" + cat ${log_path}/FAIL_$2.log +else + mv ${log_path}/$2 ${log_path}/SUCCESS_$2.log + echo -e "\033[32m ${log_path}/SUCCESS_$2 \033[0m" + cat ${log_path}/SUCCESS_$2.log +fi +} + +catchException() { + echo $1 failed due to exception >> FAIL_Exception.log +} + +cudaid1=$1; +cudaid2=$2; +echo "cudaid1,cudaid2", ${cudaid1}, ${cudaid2} +export CUDA_VISIBLE_DEVICES=${cudaid1} +#分布式log输出方式 +export PADDLE_LOG_LEVEL=debug + +export FLAGS_fraction_of_gpu_memory_to_use=0.98 +# data PaddleSlim/demo/data/ILSVRC2012 +cd ${slim_dir}/demo +if [ -d "data" ];then + rm -rf data +fi +wget -q https://sys-p0.bj.bcebos.com/slim_ci/ILSVRC2012_data_demo.tar.gz --no-check-certificate +tar xf ILSVRC2012_data_demo.tar.gz +mv ILSVRC2012_data_demo data +# download pretrain model 
+root_url="http://paddle-imagenet-models-name.bj.bcebos.com"
+pre_models="MobileNetV1 MobileNetV2 MobileNetV3_large_x1_0_ssld ResNet101_vd MobileNetV2 ResNet34 ResNet50 ResNet50_vd"
+if [ -d "pretrain" ];then
+    rm -rf pretrain
+fi
+mkdir pretrain && cd pretrain
+for model in ${pre_models}
+do
+    if [ ! -f ${model} ]; then
+        wget -q ${root_url}/${model}_pretrained.tar
+        tar xf ${model}_pretrained.tar
+    fi
+done
+
+# 1 dist
+demo_distillation_01(){
+cd ${slim_dir}/demo/distillation || catchException demo_distillation
+if [ -d "output" ];then
+    rm -rf output
+fi
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python distill.py --num_epochs 1 --save_inference True >${log_path}/demo_distillation_ResNet50_vd_T 2>&1
+print_info $? demo_distillation_ResNet50_vd_T
+
+}
+
+demo_distillation_02(){
+cd ${slim_dir}/demo/distillation || catchException demo_distillation
+if [ -d "output" ];then
+    rm -rf output
+fi
+
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python distill.py --num_epochs 1 --batch_size 64 --save_inference True \
+--model ResNet50 --teacher_model ResNet101_vd \
+--teacher_pretrained_model ../pretrain/ResNet101_vd_pretrained >${log_path}/demo_distillation_ResNet101_vd_ResNet50_T 2>&1
+print_info $? demo_distillation_ResNet101_vd_ResNet50_T
+
+python distill.py --num_epochs 1 --batch_size 64 --save_inference True \
+--model MobileNetV2_x0_25 --teacher_model MobileNetV2 \
+--teacher_pretrained_model ../pretrain/MobileNetV2_pretrained >${log_path}/demo_distillation_MobileNetV2_MobileNetV2_x0_25_T 2>&1
+print_info $? demo_distillation_MobileNetV2_MobileNetV2_x0_25_T
+}
+
+demo_deep_mutual_learning(){
+cd ${slim_dir}/demo/deep_mutual_learning || catchException demo_deep_mutual_learning
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+model=dml_mv1_mv1_gpu1
+CUDA_VISIBLE_DEVICES=${cudaid1}
+python dml_train.py --epochs 1 >${log_path}/${model} 2>&1
+print_info $? ${model}
+model=dml_mv1_res50_gpu1
+CUDA_VISIBLE_DEVICES=${cudaid1}
+python dml_train.py --models='mobilenet-resnet50' --batch_size 128 --epochs 1 >${log_path}/${model} 2>&1
+print_info $? ${model}
+}
+
+all_distillation(){ # full data, 5 models
+    demo_distillation_01 # 3
+    #demo_distillation_02
+    #demo_deep_mutual_learning # 2
+}
+# 2.1 quant/quant_aware: the small demo dataset is sufficient
+demo_quant_quant_aware(){
+cd ${slim_dir}/demo/quant/quant_aware || catchException demo_quant_quant_aware
+if [ -d "output" ];then
+    rm -rf output
+fi
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+# With release 2.1 the default batch_size=256 runs out of GPU memory, so temporarily use 128
+python train.py --model MobileNet --pretrained_model ../../pretrain/MobileNetV1_pretrained \
+--checkpoint_dir ./output/mobilenetv1 --num_epochs 1 --batch_size 128 >${log_path}/demo_quant_quant_aware_v1 2>&1
+print_info $? demo_quant_quant_aware_v1
+
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python train.py --model ResNet34 \
+--pretrained_model ../../pretrain/ResNet34_pretrained \
+--checkpoint_dir ./output/ResNet34 --num_epochs 1 >${log_path}/demo_quant_quant_aware_ResNet34_T 2>&1
+print_info $? demo_quant_quant_aware_ResNet34_T
+}
+# 2.2 quant/quant_embedding
+demo_quant_quant_embedding(){
+cd ${slim_dir}/demo/quant/quant_embedding || catchException demo_quant_quant_embedding
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+# First run one pass of training on the word2vec demo data, then compare the infer results before and after quantization
+if [ -d "data" ];then
+    rm -rf data
+fi
+wget -q https://sys-p0.bj.bcebos.com/slim_ci/word_2evc_demo_data.tar.gz --no-check-certificate
+tar xf word_2evc_demo_data.tar.gz
+mv word_2evc_demo_data data
+if [ -d "v1_cpu5_b100_lr1dir" ];then
+    rm -rf v1_cpu5_b100_lr1dir
+fi
+OPENBLAS_NUM_THREADS=1 CPU_NUM=5 python train.py --train_data_dir data/convert_text8 \
+--dict_path data/test_build_dict --num_passes 1 --batch_size 100 --model_output_dir v1_cpu5_b100_lr1dir \
+ --base_lr 1.0 --print_batch 1000 --with_speed --is_sparse >${log_path}/quant_em_word2vec_T 2>&1
+print_info $? quant_em_word2vec_T
+# infer before quantization
+python infer.py --infer_epoch --test_dir data/test_mid_dir \
+--dict_path data/test_build_dict_word_to_id_ \
+--batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/ \
+--start_index 0 --last_index 0 >${log_path}/quant_em_infer1 2>&1
+print_info $? quant_em_infer1
+# infer after quantization
+python infer.py --infer_epoch --test_dir data/test_mid_dir \
+--dict_path data/test_build_dict_word_to_id_ \
+--batch_size 20000 --model_dir v1_cpu5_b100_lr1dir/ --start_index 0 \
+--last_index 0 --emb_quant True >${log_path}/quant_em_infer2 2>&1
+print_info $? quant_em_infer2
+}
+# 2.3 quant_post # small dataset
+demo_quant_quant_post(){
+# 20210425 added 4 post-training quantization algorithms
+cd ${slim_dir}/demo/quant/quant_post || catchException demo_quant_quant_post
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+# 1 export the model
+python export_model.py --model "MobileNet" --pretrained_model ../../pretrain/MobileNetV1_pretrained \
+--data imagenet >${log_path}/st_quant_post_v1_export 2>&1
+print_info $? st_quant_post_v1_export
+# eval before quantization
+python eval.py --model_path ./inference_model/MobileNet --model_name model \
+--params_name weights >${log_path}/st_quant_post_v1_eval1 2>&1
+print_info $? st_quant_post_v1_eval1
+
+# 3 post-training quantization
+# 4 eval after quantization
+for algo in hist avg mse
+do
+## post-training quantization without bias correction (bc)
+echo "quant_post train no bc " ${algo}
+python quant_post.py --model_path ./inference_model/MobileNet \
+--save_path ./quant_model/${algo}/MobileNet \
+--model_filename model --params_filename weights --algo ${algo} >${log_path}/st_quant_post_v1_T_${algo} 2>&1
+print_info $? st_quant_post_v1_T_${algo}
+# eval after quantization
+echo "quant_post eval no bc " ${algo}
+python eval.py --model_path ./quant_model/${algo}/MobileNet --model_name __model__ \
+--params_name __params__ > ${log_path}/st_quant_post_${algo}_eval2 2>&1
+print_info $? st_quant_post_${algo}_eval2
+
+# post-training quantization with the bias correction (bc) option
+echo "quant_post train bc " ${algo}
+python quant_post.py --model_path ./inference_model/MobileNet \
+--save_path ./quant_model/${algo}_bc/MobileNet \
+--model_filename model --params_filename weights \
+--algo ${algo} --bias_correction True >${log_path}/st_quant_post_T_${algo}_bc 2>&1
+print_info $? st_quant_post_T_${algo}_bc
+
+# eval after quantization
+echo "quant_post eval bc " ${algo}
+python eval.py --model_path ./quant_model/${algo}_bc/MobileNet --model_name __model__ \
+--params_name __params__ > ${log_path}/st_quant_post_${algo}_bc_eval2 2>&1
+print_info $? st_quant_post_${algo}_bc_eval2
+
+done
+}
+
+# 2.3 quant_post_hpo # small dataset
+demo_quant_quant_post_hpo(){
+
+cd ${slim_dir}/demo/quant/quant_post_hpo || catchException demo_quant_quant_post_hpo
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+# 1. export the model
+python ../quant_post/export_model.py \
+--model "MobileNet" \
+--pretrained_model ../../pretrain/MobileNetV1_pretrained \
+--data imagenet > ${log_path}/st_quant_post__hpo_v1_export 2>&1
+print_info $? st_quant_post__hpo_v1_export
+# 2. quant_post_hpo with max_model_quant_count=2
+python quant_post_hpo.py \
+--use_gpu=True \
+--model_path="./inference_model/MobileNet/" \
+--save_path="./inference_model/MobileNet_quant/" \
+--model_filename="model" \
+--params_filename="weights" \
+--max_model_quant_count=2 > ${log_path}/st_quant_post_hpo 2>&1
+print_info $? st_quant_post_hpo
+# 3. eval after quantization
+python ../quant_post/eval.py \
+--model_path ./inference_model/MobileNet_quant \
+--model_name __model__ \
+--params_name __params__ > ${log_path}/st_quant_post_hpo_eval 2>&1
+print_info $? st_quant_post_hpo_eval
+
+}
+
+#2.4
+demo_quant_pact_quant_aware(){
+cd ${slim_dir}/demo/quant/pact_quant_aware || catchException demo_quant_pact_quant_aware
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+# plain quantization; the small demo dataset is sufficient
+# With release 2.1 the default batch_size=128 runs out of GPU memory, so temporarily use 64
+python train.py --model MobileNetV3_large_x1_0 \
+--pretrained_model ../../pretrain/MobileNetV3_large_x1_0_ssld_pretrained \
+--num_epochs 1 --lr 0.0001 --use_pact False --batch_size 128 >${log_path}/demo_quant_pact_quant_aware_v3_nopact 2>&1
+print_info $? demo_quant_pact_quant_aware_v3_nopact
+python train.py --model MobileNetV3_large_x1_0 \
+--pretrained_model ../../pretrain/MobileNetV3_large_x1_0_ssld_pretrained \
+--num_epochs 1 --lr 0.0001 --use_pact True --batch_size 64 --lr_strategy=piecewise_decay \
+--step_epochs 2 --l2_decay 1e-5 >${log_path}/demo_quant_pact_quant_aware_v3 2>&1
+print_info $? demo_quant_pact_quant_aware_v3
+# load
+python train.py --model MobileNetV3_large_x1_0 \
+--pretrained_model ../../pretrain/MobileNetV3_large_x1_0_ssld_pretrained \
+--num_epochs 2 --lr 0.0001 --use_pact True --batch_size 64 --lr_strategy=piecewise_decay \
+--step_epochs 20 --l2_decay 1e-5 \
+--checkpoint_dir ./output/MobileNetV3_large_x1_0/0 \
+--checkpoint_epoch 0 >${log_path}/demo_quant_pact_quant_aware_v3_load 2>&1
+print_info $? demo_quant_pact_quant_aware_v3_load
+}
+
+# 2.5
+demo_dygraph_quant(){
+cd ${slim_dir}/demo/dygraph/quant || catchException demo_dygraph_quant
+CUDA_VISIBLE_DEVICES=${cudaid1} python train.py --model='mobilenet_v1' \
+--pretrained_model '../../pretrain/MobileNetV1_pretrained' \
+--num_epochs 1 \
+--batch_size 128 \
+> ${log_path}/dy_quant_v1_gpu1 2>&1
+print_info $? dy_quant_v1_gpu1
+# dy_pact_v3
+CUDA_VISIBLE_DEVICES=${cudaid1} python train.py --lr=0.001 \
+--batch_size 128 \
+--use_pact=True --num_epochs=1 --l2_decay=2e-5 --ls_epsilon=0.1 \
+--pretrained_model ../../pretrain/MobileNetV3_large_x1_0_ssld_pretrained \
+--num_epochs 1 > ${log_path}/dy_pact_quant_v3_gpu1 2>&1
+print_info $? dy_pact_quant_v3_gpu1
+# multi-GPU training, using cards 0 to 3 as an example
+CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
+train.py --lr=0.001 \
+--pretrained_model ../../pretrain/MobileNetV3_large_x1_0_ssld_pretrained \
+--use_pact=True --num_epochs=1 \
+--l2_decay=2e-5 \
+--ls_epsilon=0.1 \
+--batch_size=128 \
+--model_save_dir output > ${log_path}/dy_pact_quant_v3_gpu4 2>&1
+print_info $? 
dy_pact_quant_v3_gpu4 +} +# 2.6 +ce_tests_dygraph_qat(){ +cd ${slim_dir}/ce_tests/dygraph/quant || catchException ce_tests_dygraph_qat +ln -s ${slim_dir}/demo/data/ILSVRC2012 +test_samples=1000 # if set as -1, use all test samples +data_path='./ILSVRC2012/' +batch_size=16 +epoch=1 +lr=0.0001 +num_workers=1 +output_dir=$PWD/output_models +for model in mobilenet_v1 +do +# if [ $1 == nopact ];then + # 1 quant train + echo "------1 nopact train--------", ${model} + export CUDA_VISIBLE_DEVICES=${cudaid1} + python ./src/qat.py \ + --arch=${model} \ + --data=${data_path} \ + --epoch=${epoch} \ + --batch_size=32 \ + --num_workers=${num_workers} \ + --lr=${lr} \ + --output_dir=${output_dir} \ + --enable_quant > qat_${model}_gpu1_nw1 2>&1 + # 2 eval before save quant + echo "--------2 eval before save quant -------------", ${model} + python ./src/eval.py \ + --model_path=./output_models/quant_dygraph/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > eval_before_save_${model} 2>&1 + # 3 CPU上部署量化模型,需要使用`test/save_quant_model.py`脚本进行模型转换。 + echo "--------3 save_nopact_quant_model-------------", ${model} + python src/save_quant_model.py \ + --load_model_path output_models/quant_dygraph/${model} \ + --save_model_path int8_models/${model} > save_quant_${model} 2>&1 + # 4 + echo "--------4 CPU eval after save nopact quant -------------", ${model} + export CUDA_VISIBLE_DEVICES= + python ./src/eval.py \ + --model_path=./int8_models/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > cpu_eval_after_save_${model} 2>&1 +# elif [ $1 == pact ];then + # 1 pact quant train + echo "------1 pact train--------", ${model} + export CUDA_VISIBLE_DEVICES=${cudaid1} + python ./src/qat.py \ + --arch=${model} \ + --data=${data_path} \ + --epoch=${epoch} \ + --batch_size=32 \ + --num_workers=${num_workers} \ + --lr=${lr} \ + --output_dir=$PWD/output_models_pact/ \ + --enable_quant \ + --use_pact > pact_qat_${model}_gpu1_nw1 2>&1 + # 2 eval before save quant + echo "--------2 eval before save pact quant -------------", ${model} + python ./src/eval.py \ + --model_path=./output_models_pact/quant_dygraph/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > eval_before_pact_save_${model} 2>&1 + echo "--------3 save pact quant -------------", ${model} + python src/save_quant_model.py \ + --load_model_path output_models_pact/quant_dygraph/${model} \ + --save_model_path int8_models_pact/${model} > save_pact_quant_${model} 2>&1 + echo "--------4 CPU eval after save pact quant -------------", ${model} + python ./src/eval.py \ + --model_path=./int8_models_pact/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > cpu_eval_after_pact_save_${model} 2>&1 +# fi + +done +} + +ce_tests_dygraph_qat(){ +cd ${slim_dir}/ce_tests/dygraph/quant || catchException ce_tests_dygraph_qat4 +ln -s ${slim_dir}/demo/data/ILSVRC2012 +test_samples=1000 # if set as -1, use all test samples +data_path='./ILSVRC2012/' +batch_size=16 +epoch=1 +lr=0.0001 +num_workers=1 +output_dir=$PWD/output_models +for model in mobilenet_v1 +#for model in mobilenet_v1 mobilenet_v2 resnet50 vgg16 +do + +# if [ $1 == nopact ];then + # 1 quant train + echo "------1 nopact train--------", ${model} + export CUDA_VISIBLE_DEVICES=${cudaid1} + python ./src/qat.py \ + --arch=${model} \ + --data=${data_path} \ + --epoch=${epoch} \ + --batch_size=32 \ + --num_workers=${num_workers} 
\ + --lr=${lr} \ + --output_dir=${output_dir} \ + --enable_quant > qat_${model}_gpu1_nw1 2>&1 + # 2 eval before save quant + echo "--------2 eval before save quant -------------", ${model} + python ./src/eval.py \ + --model_path=./output_models/quant_dygraph/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > eval_before_save_${model} 2>&1 + # 3 CPU上部署量化模型,需要使用`test/save_quant_model.py`脚本进行模型转换。 + echo "--------3 save_nopact_quant_model-------------", ${model} + python src/save_quant_model.py \ + --load_model_path output_models/quant_dygraph/${model} \ + --save_model_path int8_models/${model} > save_quant_${model} 2>&1 + # 4 + echo "--------4 CPU eval after save nopact quant -------------", ${model} + export CUDA_VISIBLE_DEVICES= + python ./src/eval.py \ + --model_path=./int8_models/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > cpu_eval_after_save_${model} 2>&1 +# elif [ $1 == pact ];then + # 1 pact quant train + echo "------1 pact train--------", ${model} + export CUDA_VISIBLE_DEVICES=${cudaid1} + python ./src/qat.py \ + --arch=${model} \ + --data=${data_path} \ + --epoch=${epoch} \ + --batch_size=32 \ + --num_workers=${num_workers} \ + --lr=${lr} \ + --output_dir=$PWD/output_models_pact/ \ + --enable_quant \ + --use_pact > pact_qat_${model}_gpu1_nw1 2>&1 + # 2 eval before save quant + echo "--------2 eval before save pact quant -------------", ${model} + python ./src/eval.py \ + --model_path=./output_models_pact/quant_dygraph/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > eval_before_pact_save_${model} 2>&1 + echo "--------3 save pact quant -------------", ${model} + python src/save_quant_model.py \ + --load_model_path output_models_pact/quant_dygraph/${model} \ + --save_model_path int8_models_pact/${model} > save_pact_quant_${model} 2>&1 + echo "--------4 CPU eval after save pact quant -------------", ${model} + python ./src/eval.py \ + --model_path=./int8_models_pact/${model} \ + --data_dir=${data_path} \ + --test_samples=${test_samples} \ + --batch_size=${batch_size} > cpu_eval_after_pact_save_${model} 2>&1 +# fi + +done +} + +ce_tests_dygraph_ptq(){ +cd ${slim_dir}/ce_tests/dygraph/quant || catchException ce_tests_dygraph_ptq4 +ln -s ${slim_dir}/demo/data/ILSVRC2012 +test_samples=1000 # if set as -1, use all test samples +data_path='./ILSVRC2012/' +batch_size=32 +epoch=1 +output_dir="./output_ptq" +quant_batch_num=10 +quant_batch_size=10 +for model in mobilenet_v1 +#for model in mobilenet_v1 mobilenet_v2 resnet50 vgg16 + +do + echo "--------quantize model: ${model}-------------" + export CUDA_VISIBLE_DEVICES=${cudaid1} + # save ptq quant model + python ./src/ptq.py \ + --data=${data_path} \ + --arch=${model} \ + --quant_batch_num=${quant_batch_num} \ + --quant_batch_size=${quant_batch_size} \ + --output_dir=${output_dir} > ${log_path}/ptq_${model} 2>&1 + print_info $? ptq_${model} + + echo "-------- eval fp32_infer model -------------", ${model} + python ./src/test.py \ + --model_path=${output_dir}/${model}/fp32_infer \ + --data_dir=${data_path} \ + --batch_size=${batch_size} \ + --use_gpu=True \ + --test_samples=${test_samples} \ + --ir_optim=False > ${log_path}/ptq_eval_fp32_${model} 2>&1 + print_info $? 
ptq_eval_fp32_${model}
+
+    echo "-------- eval int8_infer model -------------", ${model}
+    python ./src/test.py \
+        --model_path=${output_dir}/${model}/int8_infer \
+        --data_dir=${data_path} \
+        --batch_size=${batch_size} \
+        --use_gpu=False \
+        --test_samples=${test_samples} \
+        --ir_optim=False > ${log_path}/ptq_eval_int8_${model} 2>&1
+    print_info $? ptq_eval_int8_${model}
+
+done
+}
+
+# Used to handle release branches that have no ce_tests_dygraph_ptq case; set is_develop="False" on release branches
+is_develop="True"
+
+all_quant(){ # 10 models
+    if [ "${is_develop}" == "True" ];then
+        #ce_tests_dygraph_ptq4
+        ce_tests_dygraph_ptq
+    fi
+    demo_quant_quant_aware # 2 models
+    demo_quant_quant_embedding # 1 model
+    demo_quant_quant_post # 4 strategies
+    demo_dygraph_quant # 2 models
+    demo_quant_pact_quant_aware # 1 model
+    ce_tests_dygraph_qat # 4 models
+    #ce_tests_dygraph_qat4
+    demo_quant_quant_post_hpo
+}
+
+# 3 prune
+demo_prune(){
+cd ${slim_dir}/demo/prune || catchException demo_prune
+# 3.1 P0 prune
+
+if [ -d "models" ];then
+    rm -rf models
+fi
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python train.py --model "MobileNet" --pruned_ratio 0.31 --data "imagenet" \
+--pretrained_model ../pretrain/MobileNetV1_pretrained/ --num_epochs 1 >${log_path}/prune_v1_T 2>&1
+print_info $? prune_v1_T
+
+#3.2 prune_fpgm
+# slim_prune_fpgm_v1_T
+# export CUDA_VISIBLE_DEVICES=${cudaid1}
+# python train.py \
+# --model="MobileNet" \
+# --pretrained_model="../pretrain/MobileNetV1_pretrained" \
+# --data="imagenet" \
+# --pruned_ratio=0.3125 \
+# --lr=0.1 \
+# --num_epochs=1 \
+# --test_period=1 \
+# --step_epochs 30 60 90\
+# --l2_decay=3e-5 \
+# --lr_strategy="piecewise_decay" \
+# --criterion="geometry_median" \
+# --model_path="./fpgm_mobilenetv1_models" \
+# --save_inference True >${log_path}/slim_prune_fpgm_v1_T 2>&1
+# print_info $? slim_prune_fpgm_v1_T
+
+#slim_prune_fpgm_v2_T
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+#v2 -50%
+python train.py \
+    --model="MobileNetV2" \
+    --pretrained_model="../pretrain/MobileNetV2_pretrained" \
+    --data="imagenet" \
+    --pruned_ratio=0.325 \
+    --lr=0.001 \
+    --num_epochs=2 \
+    --test_period=1 \
+    --step_epochs 30 60 80 \
+    --l2_decay=1e-4 \
+    --lr_strategy="piecewise_decay" \
+    --criterion="geometry_median" \
+    --model_path="./output/fpgm_mobilenetv2_models" \
+    --save_inference True >${log_path}/slim_prune_fpgm_v2_T 2>&1
+print_info $? slim_prune_fpgm_v2_T
+python eval.py --model "MobileNetV2" --data "imagenet" \
+--model_path "./output/fpgm_mobilenetv2_models/0" >${log_path}/slim_prune_fpgm_v2_eval 2>&1
+print_info $? slim_prune_fpgm_v2_eval
+# ResNet34 -50
+# export CUDA_VISIBLE_DEVICES=${cudaid1}
+# python train.py \
+# --model="ResNet34" \
+# --pretrained_model="../pretrain/ResNet34_pretrained" \
+# --data="imagenet" \
+# --pruned_ratio=0.3125 \
+# --lr=0.001 \
+# --num_epochs=2 \
+# --test_period=1 \
+# --step_epochs 30 60 \
+# --l2_decay=1e-4 \
+# --lr_strategy="piecewise_decay" \
+# --criterion="geometry_median" \
+# --model_path="./output/fpgm_resnet34_50_models" \
+# --save_inference True >${log_path}/slim_prune_fpgm_resnet34_50_T 2>&1
+# print_info $? slim_prune_fpgm_resnet34_50_T
+# python eval.py --model "ResNet34" --data "imagenet" \
+# --model_path "./output/fpgm_resnet34_50_models/0" >${log_path}/slim_prune_fpgm_resnet34_50_eval 2>&1
+# print_info $? slim_prune_fpgm_resnet34_50_eval
+# ResNet34 -42 slim_prune_fpgm_resnet34_42_T
+cd ${slim_dir}/demo/prune
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python train.py \
+    --model="ResNet34" \
+    --pretrained_model="../pretrain/ResNet34_pretrained" \
+    --data="imagenet" \
+    --pruned_ratio=0.25 \
+    --num_epochs=2 \
+    --test_period=1 \
+    --lr_strategy="cosine_decay" \
+    --criterion="geometry_median" \
+    --model_path="./output/fpgm_resnet34_025_120_models" \
+    --save_inference True >${log_path}/slim_prune_fpgm_resnet34_42_T 2>&1
+print_info $? slim_prune_fpgm_resnet34_42_T
+python eval.py --model "ResNet34" --data "imagenet" \
+--model_path "./output/fpgm_resnet34_025_120_models/0" >${log_path}/slim_prune_fpgm_resnet34_42_eval 2>&1
+print_info $? slim_prune_fpgm_resnet34_42_eval
+# 3.3 prune ResNet50
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+# With release 2.1 the default batch_size=256 runs out of GPU memory, so temporarily use 128
+python train.py --model ResNet50 --pruned_ratio 0.31 --data "imagenet" \
+--save_inference True --pretrained_model ../pretrain/ResNet50_pretrained \
+--num_epochs 1 --batch_size 128 >${log_path}/prune_ResNet50_T 2>&1
+print_info $? prune_ResNet50_T
+}
+
+# 3.4 dygraph_prune
+#dy_prune_ResNet34_f42
+demo_dygraph_pruning(){
+cd ${slim_dir}/demo/dygraph/pruning || catchException demo_dygraph_pruning
+ln -s ${slim_dir}/demo/data data
+CUDA_VISIBLE_DEVICES=${cudaid1} python train.py \
+    --use_gpu=True \
+    --model="resnet34" \
+    --data="imagenet" \
+    --pruned_ratio=0.25 \
+    --num_epochs=1 \
+    --batch_size=128 \
+    --lr_strategy="cosine_decay" \
+    --criterion="fpgm" \
+    --model_path="./fpgm_resnet34_025_120_models" >${log_path}/dy_prune_ResNet34_f42_gpu1 2>&1
+print_info $? dy_prune_ResNet34_f42_gpu1
+#2.3 Resume training by setting the checkpoint option:
+CUDA_VISIBLE_DEVICES=${cudaid1} python train.py \
+    --use_gpu=True \
+    --model="resnet34" \
+    --data="imagenet" \
+    --pruned_ratio=0.25 \
+    --num_epochs=2 \
+    --batch_size=128 \
+    --lr_strategy="cosine_decay" \
+    --criterion="fpgm" \
+    --model_path="./fpgm_resnet34_025_120_models" \
+    --checkpoint="./fpgm_resnet34_025_120_models/0" >${log_path}/dy_prune_ResNet34_f42_gpu1_load 2>&1
+print_info $? dy_prune_ResNet34_f42_gpu1_load
+
+#2.4. Evaluation: run eval.py to measure the accuracy of the pruned and retrained model on the test data:
+CUDA_VISIBLE_DEVICES=${cudaid1} python eval.py \
+--checkpoint=./fpgm_resnet34_025_120_models/1 \
+--model="resnet34" \
+--pruned_ratio=0.25 \
+--batch_size=128 >${log_path}/dy_prune_ResNet34_f42_gpu1_eval 2>&1
+print_info $? dy_prune_ResNet34_f42_gpu1_eval
+
+#2.5. Export model: run the following command to export the model used for inference:
+CUDA_VISIBLE_DEVICES=${cudaid1} python export_model.py \
+--checkpoint=./fpgm_resnet34_025_120_models/final \
+--model="resnet34" \
+--pruned_ratio=0.25 \
+--output_path=./infer_final/resnet > ${log_path}/dy_prune_ResNet34_f42_gpu1_export 2>&1
+print_info $? dy_prune_ResNet34_f42_gpu1_export
+
+#add dy_prune_fpgm_mobilenetv1_50_T
+CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
+--log_dir="fpgm_mobilenetv1_train_log" \
+train.py \
+    --model="mobilenet_v1" \
+    --data="imagenet" \
+    --pruned_ratio=0.3125 \
+    --lr=0.1 \
+    --num_epochs=1 \
+    --test_period=1 \
+    --step_epochs 30 60 90\
+    --l2_decay=3e-5 \
+    --lr_strategy="piecewise_decay" \
+    --criterion="fpgm" \
+    --model_path="./fpgm_mobilenetv1_models" > ${log_path}/dy_prune_fpgm_mobilenetv1_50_T 2>&1
+print_info $? dy_prune_fpgm_mobilenetv1_50_T
+
+#add dy_prune_fpgm_mobilenetv2_50_T
+# CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
+# --log_dir="fpgm_mobilenetv2_train_log" \
+# train.py \
+# --model="mobilenet_v2" \
+# --data="imagenet" \
+# --pruned_ratio=0.325 \
+# --lr=0.001 \
+# --num_epochs=1 \
+# --test_period=1 \
+# --step_epochs 30 60 80\
+# --l2_decay=1e-4 \
+# --lr_strategy="piecewise_decay" \
+# --criterion="fpgm" \
+# --model_path="./fpgm_mobilenetv2_models" > ${log_path}/dy_prune_fpgm_mobilenetv2_50_T 2>&1
+# print_info $? dy_prune_fpgm_mobilenetv2_50_T
+
+#add
+CUDA_VISIBLE_DEVICES=${cudaid2} python -m paddle.distributed.launch \
+--log_dir="fpgm_resnet34_f_42_train_log" \
+train.py \
+    --use_gpu=True \
+    --model="resnet34" \
+    --data="imagenet" \
+    --pruned_ratio=0.25 \
+    --batch_size=128 \
+    --num_epochs=1 \
+    --test_period=1 \
+    --lr_strategy="cosine_decay" \
+    --criterion="fpgm" \
+    --model_path="./fpgm_resnet34_025_120_models" > ${log_path}/dy_prune_ResNet34_f42_gpu2 2>&1
+print_info $? dy_prune_ResNet34_f42_gpu2
+}
+
+# 3.5 st unstructured_prune
+demo_unstructured_prune(){
+cd ${slim_dir}/demo/unstructured_prune || catchException demo_unstructured_prune
+# Note: the batch_size in the command is the total batch_size across all cards, i.e. a single card uses a batch_size of 256
+## sparsity: -30%, accuracy: 70%/89%
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python train.py \
+--batch_size 256 \
+--pretrained_model ../pretrain/MobileNetV1_pretrained \
+--lr 0.05 \
+--pruning_mode threshold \
+--threshold 0.01 \
+--data imagenet \
+--lr_strategy piecewise_decay \
+--step_epochs 1 2 3 \
+--num_epochs 1 \
+--test_period 1 \
+--model_period 1 \
+--model_path st_unstructured_models >${log_path}/st_unstructured_prune_threshold_T 2>&1
+print_info $? st_unstructured_prune_threshold_T
+# eval
+python evaluate.py \
+    --pruned_model=st_unstructured_models \
+    --data="imagenet" >${log_path}/st_unstructured_prune_threshold_eval 2>&1
+print_info $? st_unstructured_prune_threshold_eval
+
+## sparsity: -55%, accuracy: 67%+/87%+
+export CUDA_VISIBLE_DEVICES=${cudaid1}
+python train.py \
+--batch_size 256 \
+--pretrained_model ../pretrain/MobileNetV1_pretrained \
+--lr 0.05 \
+--pruning_mode ratio \
+--ratio 0.55 \
+--data imagenet \
+--lr_strategy piecewise_decay \
+--step_epochs 1 2 3 \
+--num_epochs 1 \
+--test_period 1 \
+--model_period 1 \
+--model_path st_ratio_models >${log_path}/st_unstructured_prune_ratio_T 2>&1
+print_info $? st_unstructured_prune_ratio_T
+
+# MNIST dataset
+# python train.py \
+# --batch_size 256 \
+# --pretrained_model ../pretrain/MobileNetV1_pretrained \
+# --lr 0.05 \
+# --pruning_mode threshold \
+# --threshold 0.01 \
+# --data mnist \
+# --lr_strategy piecewise_decay \
+# --step_epochs 1 2 3 \
+# --num_epochs 1 \
+# --test_period 1 \
+# --model_period 1 \
+# --model_path st_unstructured_models_mnist >${log_path}/st_unstructured_prune_threshold_mnist_T 2>&1
+# print_info $? st_unstructured_prune_threshold_mnist_T
+# eval
+# python evaluate.py \
+# --pruned_model=st_unstructured_models_mnist \
+# --data="mnist" >${log_path}/st_unstructured_prune_threshold_mnist_eval 2>&1
+# print_info $? st_unstructured_prune_threshold_mnist_eval
+
+export CUDA_VISIBLE_DEVICES=${cudaid2}
+python -m paddle.distributed.launch \
+    --log_dir="st_unstructured_prune_gmp_log" \
+    train.py \
+    --batch_size 64 \
+    --data imagenet \
+    --pruning_mode ratio \
+    --ratio 0.75 \
+    --lr 0.005 \
+    --model MobileNet \
+    --num_epochs 1 \
+    --test_period 5 \
+    --model_period 10 \
+    --pretrained_model ../pretrain/MobileNetV1_pretrained \
+    --model_path "./models" \
+    --step_epochs 71 88 \
+    --initial_ratio 0.15 \
+    --pruning_steps 5 \
+    --stable_epochs 0 \
+    --pruning_epochs 54 \
+    --tunning_epochs 54 \
+    --last_epoch -1 \
+    --prune_params_type conv1x1_only \
+    --pruning_strategy gmp > ${log_path}/st_unstructured_prune_ratio_gmp 2>&1
+print_info $? st_unstructured_prune_ratio_gmp
+}
+demo_dygraph_unstructured_pruning(){
+# dy_threshold
+cd ${slim_dir}/demo/dygraph/unstructured_pruning || catchException demo_dygraph_unstructured_pruning
+export CUDA_VISIBLE_DEVICES=${cudaid2}
+## sparsity: -55%, accuracy: 67%+/87%+
+python -m paddle.distributed.launch \
+--log_dir train_dy_ratio_log train.py \
+--data imagenet \
+--lr 0.05 \
+--pruning_mode ratio \
+--ratio 0.55 \
+--batch_size 256 \
+--lr_strategy piecewise_decay \
+--step_epochs 1 2 3 \
+--num_epochs 1 \
+--test_period 1 \
+--model_period 1 \
+--model_path dy_ratio_models >${log_path}/dy_prune_ratio_T 2>&1
+print_info $? dy_prune_ratio_T
+
+## sparsity: -30%, accuracy: 70%/89%
+export CUDA_VISIBLE_DEVICES=${cudaid2}
+python -m paddle.distributed.launch \
+--log_dir train_dy_threshold_log train.py \
+--data imagenet \
+--lr 0.05 \
+--pruning_mode threshold \
+--threshold 0.01 \
+--batch_size 256 \
+--lr_strategy piecewise_decay \
+--step_epochs 1 2 3 \
+--num_epochs 1 \
+--test_period 1 \
+--model_period 1 \
+--model_path dy_threshold_models >${log_path}/dy_threshold_prune_T 2>&1
+print_info $? dy_threshold_prune_T
+# eval
+python evaluate.py --pruned_model dy_threshold_models/model.pdparams \
+--data imagenet >${log_path}/dy_threshold_prune_eval 2>&1
+print_info $? dy_threshold_prune_eval
+
+# load
+python -m paddle.distributed.launch \
+--log_dir train_dy_threshold_load_log train.py \
+--data imagenet \
+--lr 0.05 \
+--pruning_mode threshold \
+--threshold 0.01 \
+--batch_size 256 \
+--lr_strategy piecewise_decay \
+--step_epochs 1 2 3 \
+--num_epochs 3 \
+--test_period 1 \
+--model_period 1 \
+--model_path dy_threshold_models_new \
+--pretrained_model dy_threshold_models/model.pdparams \
+--last_epoch 1 > ${log_path}/dy_threshold_prune_T_load 2>&1
+print_info $? dy_threshold_prune_T_load
+# cifar10
+# python train.py --data cifar10 --lr 0.05 \
+# --pruning_mode threshold \
+# --threshold 0.01 \
+# --model_period 1 \
+# --num_epochs 2 >${log_path}/dy_threshold_prune_cifar10_T 2>&1
+# print_info $? dy_threshold_prune_cifar10_T
+
+export CUDA_VISIBLE_DEVICES=${cudaid2}
+python -m paddle.distributed.launch \
+    --log_dir="dy_unstructured_prune_gmp_log" \
+    train.py \
+    --batch_size 64 \
+    --data imagenet \
+    --pruning_mode ratio \
+    --ratio 0.75 \
+    --lr 0.005 \
+    --num_epochs 1 \
+    --test_period 5 \
+    --model_period 10 \
+    --model_path "./models" \
+    --step_epochs 71 88 \
+    --initial_ratio 0.15 \
+    --pruning_steps 100 \
+    --stable_epochs 0 \
+    --pruning_epochs 54 \
+    --tunning_epochs 54 \
+    --last_epoch -1 \
+    --pruning_strategy gmp \
+    --skip_params_type exclude_conv1x1 > ${log_path}/dy_unstructured_prune_ratio_gmp 2>&1
+print_info $? dy_unstructured_prune_ratio_gmp
+}
+
+##################
+all_prune(){ # 7 models
+    demo_prune
+    demo_dygraph_pruning
+    demo_unstructured_prune # 4 models
+    demo_dygraph_unstructured_pruning
+}
+
+#4 nas
+demo_nas(){
+# 4.1 sa_nas_mobilenetv2
+cd ${slim_dir}/demo/nas || catchException demo_nas
+model=demo_nas_sa_nas_v2_T_1card
+CUDA_VISIBLE_DEVICES=${cudaid1} python sa_nas_mobilenetv2.py --search_steps 1 --port 8881 >${log_path}/${model} 2>&1
+print_info $? ${model}
+}
+demo_nas4(){
+cd ${slim_dir}/demo/nas || catchException demo_nas4
+model=sa_nas_v2_T_1card
+CUDA_VISIBLE_DEVICES=${cudaid1} python sa_nas_mobilenetv2.py --search_steps 1 --retain_epoch 1 --port 8881 >${log_path}/${model} 2>&1
+print_info $? ${model}
+# 4.2 block_sa_nas_mobilenetv2
+model=block_sa_nas_v2_T_1card
+CUDA_VISIBLE_DEVICES=${cudaid1} python block_sa_nas_mobilenetv2.py --search_steps 1 --port 8883 >${log_path}/${model} 2>&1
+print_info $? ${model}
+
+# 4.3 rl_nas
+model=rl_nas_v2_T_1card
+CUDA_VISIBLE_DEVICES=${cudaid1} python rl_nas_mobilenetv2.py --search_steps 1 --port 8885 >${log_path}/${model} 2>&1
+print_info $? ${model}
+
+# 4.4 parl_nas
+#model=parl_nas_v2_T_1card
+#CUDA_VISIBLE_DEVICES=${cudaid1} python parl_nas_mobilenetv2.py \
+#--search_steps 1 --port 8887 >${log_path}/${model} 2>&1
+#print_info $? ${model}
+}
+
+all_nas(){ # 3 models
+    demo_nas
+}
+# 5 darts
+# search 1card # DARTS first-order approximation search
+demo_darts(){
+cd ${slim_dir}/demo/darts || catchException demo_darts
+model=darts1_search_1card
+CUDA_VISIBLE_DEVICES=${cudaid1} python search.py --epochs 1 \
+--use_multiprocess False \
+--batch_size 32 >${log_path}/${model} 2>&1
+print_info $? ${model}
+#train
+model=pcdarts_train_1card
+CUDA_VISIBLE_DEVICES=${cudaid1} python train.py --arch='PC_DARTS' \
+--epochs 1 --use_multiprocess False \
+--batch_size 32 >${log_path}/${model} 2>&1
+print_info $? ${model}
+# visualization
+#pip install graphviz
+#model=slim_darts_visualize_pcdarts
+#python visualize.py PC_DARTS > ${log_path}/${model} 2>&1
+#print_info $? ${model}
+}
+
+
+
+slimfacenet(){
+cd ${slim_dir}/demo/slimfacenet || catchException slimfacenet
+ln -s ${data_path}/slim/slimfacenet/CASIA CASIA
+ln -s ${data_path}/slim/slimfacenet/lfw lfw
+model=slim_slimfacenet_B75_train
+CUDA_VISIBLE_DEVICES=${cudaid1} python -u train_eval.py \
+--train_data_dir=./CASIA/ --test_data_dir=./lfw/ \
+--action train --model=SlimFaceNet_B_x0_75 \
+--start_epoch 0 --total_epoch 1 >${log_path}/slim_slimfacenet_B75_train 2>&1
+print_info $? ${model}
+model=slim_slimfacenet_B75_quan
+CUDA_VISIBLE_DEVICES=${cudaid1} python train_eval.py \
+--action quant --train_data_dir=./CASIA/ \
+--test_data_dir=./lfw/ >${log_path}/slim_slimfacenet_B75_quan 2>&1
+print_info $? ${model}
+model=slim_slimfacenet_B75_eval
+CUDA_VISIBLE_DEVICES=${cudaid1} python train_eval.py \
+--action test --train_data_dir=./CASIA/ \
+--test_data_dir=./lfw/ >${log_path}/slim_slimfacenet_B75_eval 2>&1
+print_info $? ${model}
+}
+
+all_darts(){ # 2 models
+    demo_darts
+    #slimfacenet needs to be removed
+}
+
+demo_latency(){
+cd ${slim_dir}/demo/analysis || catchException demo_latency
+model=latency_mobilenet_v1_fp32
+python latency_predictor.py --model mobilenet_v1 --data_type fp32 >${log_path}/${model} 2>&1
+print_info $? ${model}
+model=latency_mobilenet_v1_int8
+python latency_predictor.py --model mobilenet_v1 --data_type int8 >${log_path}/${model} 2>&1
+print_info $? ${model}
+model=latency_mobilenet_v2_fp32
+python latency_predictor.py --model mobilenet_v2 --data_type fp32 >${log_path}/${model} 2>&1
+print_info $? ${model}
+model=latency_mobilenet_v2_int8
+python latency_predictor.py --model mobilenet_v2 --data_type int8 >${log_path}/${model} 2>&1
+print_info $? ${model}
+}
+
+all_latency(){
+    demo_latency
+}
+
+####################################
+export all_case_list=(all_distillation all_quant all_prune all_nas )
+
+export all_case_time=0
+declare -A all_case_dic
+all_case_dic=(["all_distillation"]=5 ["all_quant"]=15 ["all_prune"]=1 ["all_nas"]=30 ["all_darts"]=30 ['unstructured_prune']=15 ['dy_qat1']=1)
+for key in $(echo ${!all_case_dic[*]});do
+    all_case_time=`expr ${all_case_time} + ${all_case_dic[$key]}`
+done
+set -e
+echo -e "\033[35m ---- P0case_list length: ${#all_case_list[*]}, cases: ${all_case_list[*]} \033[0m"
+echo -e "\033[35m ---- P0case_time: $all_case_time min \033[0m"
+set +e
+####################################
+echo -e "\033[35m ---- start run case \033[0m"
+case_num=1
+for model in ${all_case_list[*]};do
+    echo -e "\033[35m ---- running P0case $case_num/${#all_case_list[*]}: ${model} , task time: ${all_case_dic[${model}]} min \033[0m"
+    ${model}
+    let case_num++
+done
+echo -e "\033[35m ---- end run case \033[0m"
+
+cd ${slim_dir}/logs
+FF=`ls *FAIL*|wc -l`
+if [ "${FF}" -gt "0" ];then
+    exit 1
+else
+    exit 0
+fi
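Reviewer note: the added script reads its first two arguments as cudaid1/cudaid2 and relies on slim_dir, log_path (and data_path for the slimfacenet case) being exported by the caller; none of these variables are defined inside the script, and the job exits with status 1 whenever a FAIL_* log ends up under ${slim_dir}/logs. A minimal invocation sketch follows; the concrete paths are illustrative assumptions, not part of the patch.

# Illustrative usage only; the paths below are assumed, not taken from the patch.
export slim_dir=/workspace/PaddleSlim    # checkout containing demo/ and ce_tests/
export log_path=${slim_dir}/logs         # print_info writes SUCCESS_*/FAIL_* logs here
export data_path=/workspace/datasets     # only needed by the slimfacenet case
mkdir -p ${log_path}
bash slim_ci_demo_all_case.sh 0 0,1,2,3  # $1 -> cudaid1 (single card), $2 -> cudaid2 (multi-card)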