Unverified commit 919a9b15, authored by Guanghua Yu, committed by GitHub

fix quant aware distributed train (#1206)

Parent f275cefa
......@@ -157,7 +157,7 @@ compiled_train_prog = compiled_train_prog.with_data_parallel(
### Training commands
Standard quantization:
- Standard quantization:
```
python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/MobileNetV3_large_x1_0_ssld_pretrained --num_epochs 30 --lr 0.0001 --use_pact False
......@@ -177,14 +177,24 @@ python train.py --model MobileNetV3_large_x1_0 --pretrained_model ./pretrain/Mob
```
As the output shows, the loss under standard quantization fluctuates and becomes NaN by around the second epoch of the experiment; standard quantization-aware training is quite unstable.
Train with PACT quantization
- Train with PACT quantization
```
# First analyze the activation distribution of the MobileNetV3 model to initialize the PACT clipping thresholds
python train.py --analysis=True
# Launch PACT quantization-aware training
```
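For reference, one common way an analysis step like `--analysis=True` can derive initial clipping values is to run a few batches through the float model and take a high percentile of the absolute activations per layer. The sketch below only illustrates that general technique; `collect_activations` and the percentile choice are assumptions, not the script's actual implementation.
```
# Illustrative sketch only: estimate per-layer PACT clipping thresholds from
# activation statistics. `collect_activations` is a hypothetical helper that
# yields (layer_name, numpy array of activations) pairs over a few batches.
import numpy as np

def estimate_pact_thresholds(collect_activations, percentile=99.9):
    """Return {layer_name: clipping threshold} from a high percentile of |activation|."""
    thresholds = {}
    for layer_name, acts in collect_activations():
        value = float(np.percentile(np.abs(acts), percentile))
        # Keep the largest estimate seen across batches for each layer.
        thresholds[layer_name] = max(thresholds.get(layer_name, 0.0), value)
    return thresholds
```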
Launch PACT quantization training on a single GPU:
```
export CUDA_VISIBLE_DEVICES=0
python train.py
```
Launch PACT quantization training on multiple GPUs:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py --batch_size=64
```
The output is:
```
2020-06-05 15:25:37,647-INFO: epoch[0]-batch[10] - loss: 1.60160636902; acc_top1: 0.65625; acc_top5: 0.890625; time: 1.56788897514
......
......@@ -109,6 +109,14 @@ def create_optimizer(args):
return cosine_decay(args)
def _prepare_envs():
devices = paddle.device.get_device().split(':')[0]
places = paddle.device._convert_to_place(devices)
_logger.info(f"devices: {devices}")
exe = paddle.static.Executor(places)
return exe, places
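For context, a minimal sketch (not part of this commit) of what `_prepare_envs()` resolves to: `paddle.device.get_device()` reports the device of the current process, so each worker started by `paddle.distributed.launch` builds its Executor on its own place rather than on the previously hard-coded `CUDAPlace(0)`.
```
# Illustration only: the executor place now follows the current process's device.
import paddle

paddle.enable_static()
device_type = paddle.device.get_device().split(':')[0]  # 'gpu' on a GPU worker, otherwise 'cpu'
place = paddle.device._convert_to_place(device_type)     # private helper also used by _prepare_envs()
exe = paddle.static.Executor(place)
print(type(place).__name__)                              # e.g. 'CUDAPlace' or 'CPUPlace'
```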
def compress(args):
num_workers = 4
shuffle = True
......@@ -158,10 +166,7 @@ def compress(args):
learning_rate, opt = create_optimizer(args)
opt.minimize(avg_cost)
place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
places = paddle.static.cuda_places(
) if args.use_gpu else paddle.static.cpu_places()
exe = paddle.static.Executor(place)
exe, places = _prepare_envs()
exe.run(paddle.static.default_startup_program())
train_loader = paddle.io.DataLoader(
......@@ -177,7 +182,7 @@ def compress(args):
valid_loader = paddle.io.DataLoader(
val_dataset,
places=place,
places=places,
feed_list=[image, label],
drop_last=False,
return_list=False,
......@@ -290,7 +295,7 @@ def compress(args):
val_program = quant_aware(
val_program,
place,
places,
quant_config,
scope=None,
act_preprocess_func=act_preprocess_func,
......@@ -299,7 +304,7 @@ def compress(args):
for_test=True)
compiled_train_prog = quant_aware(
train_prog,
place,
places,
quant_config,
scope=None,
act_preprocess_func=act_preprocess_func,
......@@ -420,7 +425,7 @@ def compress(args):
# 3. Freeze the graph after training by adjusting the quantize
# operators' order for the inference.
# The dtype of float_program's weights is float32, but in int8 range.
float_program, int8_program = convert(val_program, place, quant_config, \
float_program, int8_program = convert(val_program, places, quant_config, \
scope=None, \
save_int8=True)
_logger.info("eval best_model after convert")
......
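As a possible follow-up to the `convert` step above, the frozen `float_program` can be exported for inference. This is only a sketch: the feed/fetch variables (`image`, `acc_top1`) and the output path are assumptions based on the surrounding code, not part of this commit.
```
# Hypothetical export step after convert(); variable names follow compress() above.
paddle.static.save_inference_model(
    path_prefix='./quant_output/float',  # assumed output location
    feed_vars=[image],
    fetch_vars=[acc_top1],
    executor=exe,
    program=float_program)
```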
......@@ -68,7 +68,20 @@ compiled_train_prog = compiled_train_prog.with_data_parallel(
### 训练命令
- Single-GPU launch:
```
export CUDA_VISIBLE_DEVICES=0
python train.py --model MobileNet --pretrained_model ./pretrain/MobileNetV1_pretrained --checkpoint_dir ./output/mobilenetv1 --num_epochs 30
```
- Multi-GPU launch:
```
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m paddle.distributed.launch --log_dir=log --gpus 0,1,2,3 train.py \
--model MobileNet \
--pretrained_model ./pretrain/MobileNetV1_pretrained \
--checkpoint_dir ./output/mobilenetv1 \
--num_epochs 30
```
After training finishes, the final test result of ``best_model`` is reported; it is very close to MobileNet's pre-quantization accuracy of top1=70.99% and top5=89.68%.
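One practical note on the multi-GPU command above, assuming `batch_size` is interpreted per worker (which the change of its default from `64 * 4` to `64` in train.py suggests): with 4 launched workers, each step consumes 4 × 64 = 256 images globally, matching the previous single-process default of 64 × 4.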
......@@ -25,10 +25,10 @@ _logger = get_logger(__name__, level=logging.INFO)
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 64 * 4, "Minibatch size.")
add_arg('batch_size', int, 64, "Minibatch size.")
add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
add_arg('model', str, "MobileNet", "The target model.")
add_arg('pretrained_model', str, "../pretrained_model/MobileNetV1_pretrained", "Whether to use pretrained model.")
add_arg('pretrained_model', str, "./pretrain/MobileNetV1_pretrained", "Whether to use pretrained model.")
add_arg('lr', float, 0.0001, "The learning rate used to fine-tune pruned model.")
add_arg('lr_strategy', str, "piecewise_decay", "The learning rate decay strategy.")
add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
......@@ -84,6 +84,14 @@ def create_optimizer(args):
return cosine_decay(args)
def _prepare_envs():
devices = paddle.device.get_device().split(':')[0]
places = paddle.device._convert_to_place(devices)
_logger.info(f"devices: {devices}")
exe = paddle.static.Executor(places)
return exe, places
def compress(args):
num_workers = 4
shuffle = True
......@@ -161,7 +169,7 @@ def compress(args):
train_prog = paddle.static.default_main_program()
val_program = paddle.static.default_main_program().clone(for_test=True)
place = paddle.CUDAPlace(0) if args.use_gpu else paddle.CPUPlace()
exe, places = _prepare_envs()
############################################################################################################
# 2. quantization transform programs (training aware)
# Make some quantization transforms in the graph before training and testing.
......@@ -169,13 +177,12 @@ def compress(args):
# some fake quantize operators and fake dequantize operators.
############################################################################################################
val_program = quant_aware(
val_program, place, quant_config, scope=None, for_test=True)
val_program, places, quant_config, scope=None, for_test=True)
compiled_train_prog = quant_aware(
train_prog, place, quant_config, scope=None, for_test=False)
train_prog, places, quant_config, scope=None, for_test=False)
opt = create_optimizer(args)
opt.minimize(avg_cost)
exe = paddle.static.Executor(place)
exe.run(paddle.static.default_startup_program())
if pretrain:
......@@ -185,9 +192,6 @@ def compress(args):
if args.pretrained_model:
paddle.static.load(train_prog, args.pretrained_model, exe)
places = paddle.static.cuda_places(
) if args.use_gpu else paddle.static.cpu_places()
train_loader = paddle.io.DataLoader(
train_dataset,
places=places,
......@@ -200,7 +204,7 @@ def compress(args):
num_workers=num_workers)
valid_loader = paddle.io.DataLoader(
val_dataset,
places=place,
places=places,
feed_list=[image, label],
drop_last=False,
return_list=False,
......@@ -290,7 +294,7 @@ def compress(args):
# operators' order for the inference.
# The dtype of float_program's weights is float32, but in int8 range.
############################################################################################################
float_program, int8_program = convert(val_program, place, quant_config, \
float_program, int8_program = convert(val_program, places, quant_config, \
scope=None, \
save_int8=True,
onnx_format=args.onnx_format)
......
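For readers tracing the `quant_config` argument passed to `quant_aware` and `convert` above: its definition lies outside the changed hunks, but a typical PaddleSlim quantization config for these examples looks roughly like the sketch below (the keys follow PaddleSlim's quantization config; the exact values used here are assumptions).
```
# Illustrative quant_aware/convert configuration; values may differ from the examples' actual config.
quant_config = {
    'weight_quantize_type': 'channel_wise_abs_max',        # per-channel weight quantization
    'activation_quantize_type': 'moving_average_abs_max',  # EMA-based activation scales
    'weight_bits': 8,
    'activation_bits': 8,
    'not_quant_pattern': ['skip_quant'],                    # ops whose names match are skipped
    'quantize_op_types': ['conv2d', 'depthwise_conv2d', 'mul'],
    'dtype': 'int8',
    'window_size': 10000,                                    # used by 'range_abs_max'
    'moving_rate': 0.9,                                      # used by 'moving_average_abs_max'
}
```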