From 053b1d370584c8b0966c0857724f1074107a0961 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Wed, 16 Oct 2019 11:38:08 +0800 Subject: [PATCH] Error out if fp16 is enabled and affine channel is used (#3508) * Error out if fp16 is enabled and affine channel is used * Add more documentation on mixed precision training --- configs/faster_rcnn_r50_fpn_1x.yml | 2 +- configs/mask_rcnn_r50_fpn_1x.yml | 2 +- docs/GETTING_STARTED.md | 16 ++++++++++++++++ docs/GETTING_STARTED_cn.md | 15 +++++++++++++++ tools/train.py | 6 ++++++ 5 files changed, 39 insertions(+), 2 deletions(-) diff --git a/configs/faster_rcnn_r50_fpn_1x.yml b/configs/faster_rcnn_r50_fpn_1x.yml index 8ddc6f6c9..c71910610 100644 --- a/configs/faster_rcnn_r50_fpn_1x.yml +++ b/configs/faster_rcnn_r50_fpn_1x.yml @@ -21,7 +21,7 @@ FasterRCNN: bbox_assigner: BBoxAssigner ResNet: - norm_type: affine_channel + norm_type: bn norm_decay: 0. depth: 50 feature_maps: [2, 3, 4, 5] diff --git a/configs/mask_rcnn_r50_fpn_1x.yml b/configs/mask_rcnn_r50_fpn_1x.yml index 3a76395ee..a889ea283 100644 --- a/configs/mask_rcnn_r50_fpn_1x.yml +++ b/configs/mask_rcnn_r50_fpn_1x.yml @@ -24,7 +24,7 @@ ResNet: depth: 50 feature_maps: [2, 3, 4, 5] freeze_at: 2 - norm_type: affine_channel + norm_type: bn FPN: max_level: 6 diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md index 788f7f408..ca3a6df0a 100644 --- a/docs/GETTING_STARTED.md +++ b/docs/GETTING_STARTED.md @@ -80,6 +80,22 @@ python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ finetune_exclude_pretrained_params = ['cls_score','bbox_pred'] ``` +- Mixed Precision Training + +Mixed precision training can be enabled with `--fp16` flag. Currently Faster-FPN, Mask-FPN and Yolov3 have been verified to be working with little to no loss of precision (less than 0.2 mAP) + +To speed up mixed precision training, it is recommended to train in multi-process mode, for example + +```bash +export PYTHONPATH=$PYTHONPATH:. 
+python -m paddle.distributed.launch --selected_gpus 0,1,2,3,4,5,6,7 tools/train.py --fp16 -c configs/faster_rcnn_r50_fpn_1x.yml +``` + +If loss becomes `NaN` during training, try tweaking the `--loss_scale` value. Please refer to the Nvidia [documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#mptrain) on mixed precision training for details. + +Also, please note mixed precision training currently requires changing `norm_type` from `affine_channel` to `bn`. + + ##### NOTES - `CUDA_VISIBLE_DEVICES` can specify different gpu numbers. Such as: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. GPU calculation rules can refer [FAQ](#faq) diff --git a/docs/GETTING_STARTED_cn.md b/docs/GETTING_STARTED_cn.md index 2f0dff5fe..9bbd12892 100644 --- a/docs/GETTING_STARTED_cn.md +++ b/docs/GETTING_STARTED_cn.md @@ -81,6 +81,21 @@ python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ finetune_exclude_pretrained_params = ['cls_score','bbox_pred'] ``` +- 混合精度训练 + +通过设置 `--fp16` 命令行选项可以启用混合精度训练。目前混合精度训练已经在Faster-FPN, Mask-FPN 及 Yolov3 上进行验证，几乎没有精度损失(小于0.2 mAP)。 + +建议使用多进程方式来进一步加速混合精度训练。示例如下。 + +```bash +export PYTHONPATH=$PYTHONPATH:. +python -m paddle.distributed.launch --selected_gpus 0,1,2,3,4,5,6,7 tools/train.py --fp16 -c configs/faster_rcnn_r50_fpn_1x.yml +``` + +如果训练过程中loss出现`NaN`,请尝试调节`--loss_scale`选项数值,细节请参看混合精度训练相关的[Nvidia文档](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#mptrain)。 + +另外,请注意将配置文件中的 `norm_type` 由 `affine_channel` 改为 `bn`。 + ##### 提示 - `CUDA_VISIBLE_DEVICES` 参数可以指定不同的GPU。例如: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. 
GPU计算规则可以参考 [FAQ](#faq) diff --git a/tools/train.py b/tools/train.py index 27551c599..b58cacfd1 100644 --- a/tools/train.py +++ b/tools/train.py @@ -122,6 +122,12 @@ def main(): model = create(main_arch) train_pyreader, feed_vars = create_feed(train_feed) + if FLAGS.fp16: + assert (getattr(model.backbone, 'norm_type', None) + != 'affine_channel'), \ + '--fp16 currently does not support affine channel, ' \ + 'please modify backbone settings to use batch norm' + with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: train_fetches = model.train(feed_vars) -- GitLab