From 919a9b15d8a99ecc41c848b3521d3d6744c3bcf9 Mon Sep 17 00:00:00 2001
From: Guanghua Yu <742925032@qq.com>
Date: Wed, 29 Jun 2022 17:02:54 +0800
Subject: [PATCH] fix quant aware distributed train (#1206)

---
 demo/quant/pact_quant_aware/README.md | 16 +++++++++++++---
 demo/quant/pact_quant_aware/train.py  | 21 +++++++++++++--------
 demo/quant/quant_aware/README.md      | 13 +++++++++++++
 demo/quant/quant_aware/train.py       | 26 +++++++++++++++-----------
 4 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/demo/quant/pact_quant_aware/README.md b/demo/quant/pact_quant_aware/README.md
index 23139e56..7f9c01c4 100644
--- a/demo/quant/pact_quant_aware/README.md
+++ b/demo/quant/pact_quant_aware/README.md
@@ -157,7 +157,7 @@ compiled_train_prog = compiled_train_prog.with_data_parallel(
 
 ### Training commands
 
-Vanilla quantization:
+- Vanilla quantization:
 ```
 python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --num_epochs 30 --lr 0.0001 --use_pact False
@@ -177,14 +177,24 @@ python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/Mob
 ```
 As the log shows, the loss of vanilla quantization is unstable, turning NaN by the second epoch; plain quantization-aware training is fragile here.
-Training with PACT quantization
+
+- Training with PACT quantization
 ```
 # First analyze the activation distribution of the MobileNetV3 model to initialize the PACT clipping thresholds
 python train.py --analysis=True
-# Launch PACT quantization-aware training
+```
+
+Launch PACT quantization-aware training on a single GPU:
+```
+export CUDA_VISIBLE_DEVICES=0
 python train.py
 ```
+Launch PACT quantization-aware training on multiple GPUs:
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py --batch_size=64
+```
+
 The output is:
 ```
 2020-06-05 15:25:37,647-INFO: epoch[0]-batch[10] - loss: 1.60160636902; acc_top1: 0.65625; acc_top5: 0.890625; time: 1.56788897514
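Note on the README hunk above: PACT constrains each activation to the range [0, α] with a learnable clipping threshold α, and the `--analysis` pass only picks a starting value for α from observed activation statistics, which is why that step precedes training. Below is a minimal dygraph sketch of the clipping idea, not the demo's actual implementation; the class name `PACTClip` and the `init_alpha` argument are illustrative placeholders.

```python
import paddle


class PACTClip(paddle.nn.Layer):
    """Minimal PACT sketch: clip activations into [0, alpha], alpha learnable."""

    def __init__(self, init_alpha=20.0):
        super().__init__()
        # init_alpha stands in for the per-model threshold that the demo's
        # `--analysis` run derives from activation statistics.
        self.alpha = self.create_parameter(
            shape=[1],
            default_initializer=paddle.nn.initializer.Constant(init_alpha))

    def forward(self, x):
        # x - relu(x - alpha) caps values at alpha while keeping a gradient
        # path to alpha; the outer relu enforces the lower bound of 0.
        x = x - paddle.nn.functional.relu(x - self.alpha)
        return paddle.nn.functional.relu(x)


# Illustrative usage:
# out = PACTClip(init_alpha=8.0)(paddle.randn([4, 16]))
```

Writing the upper clip as `x - relu(x - alpha)` rather than a hard `clip` keeps a gradient path to `alpha`, which is what lets the threshold be trained alongside the weights.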
diff --git a/demo/quant/pact_quant_aware/train.py b/demo/quant/pact_quant_aware/train.py
index 0aad4d68..fb70c0fc 100644
--- a/demo/quant/pact_quant_aware/train.py
+++ b/demo/quant/pact_quant_aware/train.py
@@ -109,6 +109,14 @@ def create_optimizer(args):
     return cosine_decay(args)
 
 
+def _prepare_envs():
+    devices = paddle.device.get_device().split(':')[0]
+    places = paddle.device._convert_to_place(devices)
+    _logger.info(f"devices: {devices}")
+    exe = paddle.static.Executor(places)
+    return exe, places
+
+
 def compress(args):
     num_workers = 4
     shuffle = True
@@ -158,10 +166,7 @@ def compress(args):
         learning_rate, opt = create_optimizer(args)
         opt.minimize(avg_cost)
 
-    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
-    places = paddle.static.cuda_places(
-    ) if args.use_gpu else paddle.static.cpu_places()
-    exe = paddle.static.Executor(place)
+    exe, places = _prepare_envs()
     exe.run(paddle.static.default_startup_program())
 
     train_loader = paddle.io.DataLoader(
@@ -177,7 +182,7 @@ def compress(args):
 
     valid_loader = paddle.io.DataLoader(
         val_dataset,
-        places=place,
+        places=places,
         feed_list=[image, label],
         drop_last=False,
         return_list=False,
@@ -290,7 +295,7 @@ def compress(args):
 
     val_program = quant_aware(
         val_program,
-        place,
+        places,
         quant_config,
         scope=None,
         act_preprocess_func=act_preprocess_func,
@@ -299,7 +304,7 @@ def compress(args):
         for_test=True)
     compiled_train_prog = quant_aware(
         train_prog,
-        place,
+        places,
         quant_config,
         scope=None,
         act_preprocess_func=act_preprocess_func,
@@ -420,7 +425,7 @@ def compress(args):
     # 3. Freeze the graph after training by adjusting the quantize
     # operators' order for the inference.
     # The dtype of float_program's weights is float32, but in int8 range.
-    float_program, int8_program = convert(val_program, place, quant_config, \
+    float_program, int8_program = convert(val_program, places, quant_config, \
         scope=None, \
         save_int8=True)
     _logger.info("eval best_model after convert")
diff --git a/demo/quant/quant_aware/README.md b/demo/quant/quant_aware/README.md
index 4d0a769a..6422d57f 100644
--- a/demo/quant/quant_aware/README.md
+++ b/demo/quant/quant_aware/README.md
@@ -68,7 +68,20 @@ compiled_train_prog = compiled_train_prog.with_data_parallel(
 
 ### Training commands
 
+- Single-GPU launch:
+
 ```
+export CUDA_VISIBLE_DEVICES=0
 python train.py --model MobileNet --pretrained_model ./pretrain/MobileNetV1_pretrained --checkpoint_dir ./output/mobilenetv1 --num_epochs 30
 ```
+
+- Multi-GPU launch:
+```
+CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py \
+      --model MobileNet \
+      --pretrained_model ./pretrain/MobileNetV1_pretrained \
+      --checkpoint_dir ./output/mobilenetv1 \
+      --num_epochs 30
+```
+
 After training, the final evaluation results of ``best_model`` are very close to MobileNet's pre-quantization accuracy (top1=70.99%, top5=89.68%).
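The `_prepare_envs()` helper added to the pact train.py above (and to quant_aware/train.py below) is the heart of the distributed fix: it derives the executor place from the device the current worker process actually sees, instead of hard-coding `paddle.CUDAPlace(0)`, so each rank binds to its own GPU under `paddle.distributed.launch`. A standalone sketch of that resolution follows; the printed values are illustrative.

```python
import paddle

# Under `paddle.distributed.launch`, each worker process reports its own
# current device here, e.g. 'gpu:1' on rank 1 (or 'cpu' without CUDA).
device = paddle.device.get_device()
dev_type = device.split(':')[0]              # 'gpu' or 'cpu'

# `_convert_to_place` is the (private) helper the patched demos call; for
# 'gpu' it resolves to the CUDA place of this worker's current device.
place = paddle.device._convert_to_place(dev_type)

exe = paddle.static.Executor(place)
print(device, '->', place)                   # e.g. gpu:1 -> CUDAPlace(1)
```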
diff --git a/demo/quant/quant_aware/train.py b/demo/quant/quant_aware/train.py
index c42700ad..abf6073e 100644
--- a/demo/quant/quant_aware/train.py
+++ b/demo/quant/quant_aware/train.py
@@ -25,10 +25,10 @@ _logger = get_logger(__name__, level=logging.INFO)
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('batch_size',       int,  64 * 4,              "Minibatch size.")
+add_arg('batch_size',       int,  64,                  "Minibatch size.")
 add_arg('use_gpu',          bool, True,                "Whether to use GPU or not.")
 add_arg('model',            str,  "MobileNet",         "The target model.")
-add_arg('pretrained_model', str,  "../pretrained_model/MobileNetV1_pretrained", "Whether to use pretrained model.")
+add_arg('pretrained_model', str,  "./pretrain/MobileNetV1_pretrained", "Whether to use pretrained model.")
 add_arg('lr',               float, 0.0001,             "The learning rate used to fine-tune pruned model.")
 add_arg('lr_strategy',      str,  "piecewise_decay",   "The learning rate decay strategy.")
 add_arg('l2_decay',         float, 3e-5,               "The l2_decay parameter.")
@@ -84,6 +84,14 @@ def create_optimizer(args):
     return cosine_decay(args)
 
 
+def _prepare_envs():
+    devices = paddle.device.get_device().split(':')[0]
+    places = paddle.device._convert_to_place(devices)
+    _logger.info(f"devices: {devices}")
+    exe = paddle.static.Executor(places)
+    return exe, places
+
+
 def compress(args):
     num_workers = 4
     shuffle = True
@@ -161,21 +169,20 @@ def compress(args):
     train_prog = paddle.static.default_main_program()
     val_program = paddle.static.default_main_program().clone(for_test=True)
 
-    place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
+    exe, places = _prepare_envs()
     ############################################################################################################
     # 2. quantization transform programs (training aware)
     #    Make some quantization transforms in the graph before training and testing.
     #    According to the weight and activation quantization type, the graph will be added
     #    some fake quantize operators and fake dequantize operators.
     ############################################################################################################
     val_program = quant_aware(
-        val_program, place, quant_config, scope=None, for_test=True)
+        val_program, places, quant_config, scope=None, for_test=True)
     compiled_train_prog = quant_aware(
-        train_prog, place, quant_config, scope=None, for_test=False)
+        train_prog, places, quant_config, scope=None, for_test=False)
 
     opt = create_optimizer(args)
     opt.minimize(avg_cost)
-    exe = paddle.static.Executor(place)
     exe.run(paddle.static.default_startup_program())
 
     if pretrain:
@@ -185,9 +192,6 @@ def compress(args):
     if args.pretrained_model:
         paddle.static.load(train_prog, args.pretrained_model, exe)
 
-    places = paddle.static.cuda_places(
-    ) if args.use_gpu else paddle.static.cpu_places()
-
     train_loader = paddle.io.DataLoader(
         train_dataset,
         places=places,
@@ -200,7 +204,7 @@ def compress(args):
         num_workers=num_workers)
     valid_loader = paddle.io.DataLoader(
         val_dataset,
-        places=place,
+        places=places,
         feed_list=[image, label],
         drop_last=False,
         return_list=False,
@@ -290,7 +294,7 @@ def compress(args):
     # operators' order for the inference.
     # The dtype of float_program's weights is float32, but in int8 range.
     ############################################################################################################
-    float_program, int8_program = convert(val_program, place, quant_config, \
+    float_program, int8_program = convert(val_program, places, quant_config, \
         scope=None, \
         save_int8=True,
         onnx_format=args.onnx_format)
--
GitLab
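Taken together, the four diffs keep one `places` value flowing through `quant_aware`, the data loaders, and `convert`. As a closing illustration, here is a hedged, self-contained sketch of that transform-then-freeze sequence on CPU; the toy fc network and the `quant_config` values are assumptions for the example, training is elided, and only API calls that appear in the demos themselves are used.

```python
import paddle
from paddleslim.quant import quant_aware, convert

paddle.enable_static()

# Toy static-graph program standing in for the demo's image model.
main_prog = paddle.static.Program()
startup_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, startup_prog):
    x = paddle.static.data(name='x', shape=[None, 16], dtype='float32')
    out = paddle.static.nn.fc(x, size=4)
val_program = main_prog.clone(for_test=True)

# Illustrative config; the demos set these same two keys.
quant_config = {
    'weight_quantize_type': 'channel_wise_abs_max',
    'activation_quantize_type': 'moving_average_abs_max',
}

place = paddle.CPUPlace()
exe = paddle.static.Executor(place)
exe.run(startup_prog)

# 1) Insert fake quantize/dequantize ops into the eval-mode program.
val_quant = quant_aware(val_program, place, quant_config,
                        scope=None, for_test=True)
# ... fine-tuning of the for_test=False program would happen here, so the
# frozen programs below would otherwise carry untrained scales ...
# 2) Freeze for inference, optionally keeping an int8 copy of the weights.
float_prog, int8_prog = convert(val_quant, place, quant_config,
                                scope=None, save_int8=True)
```

In the real demos the fine-tuning loop runs on the `for_test=False` program before `convert` is applied to the evaluation program, and `place` is the per-rank value returned by `_prepare_envs()`.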