From 053b1d370584c8b0966c0857724f1074107a0961 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Wed, 16 Oct 2019 11:38:08 +0800 Subject: [PATCH] Error out if fp16 is enabled and affine channel is used (#3508) * Error out if fp16 is enabled and affine channel is used * Add more documentation on mixed precision training --- configs/faster_rcnn_r50_fpn_1x.yml | 2 +- configs/mask_rcnn_r50_fpn_1x.yml | 2 +- docs/GETTING_STARTED.md | 16 ++++++++++++++++ docs/GETTING_STARTED_cn.md | 15 +++++++++++++++ tools/train.py | 6 ++++++ 5 files changed, 39 insertions(+), 2 deletions(-) diff --git a/configs/faster_rcnn_r50_fpn_1x.yml b/configs/faster_rcnn_r50_fpn_1x.yml index 8ddc6f6c9..c71910610 100644 --- a/configs/faster_rcnn_r50_fpn_1x.yml +++ b/configs/faster_rcnn_r50_fpn_1x.yml @@ -21,7 +21,7 @@ FasterRCNN: bbox_assigner: BBoxAssigner ResNet: - norm_type: affine_channel + norm_type: bn norm_decay: 0. depth: 50 feature_maps: [2, 3, 4, 5] diff --git a/configs/mask_rcnn_r50_fpn_1x.yml b/configs/mask_rcnn_r50_fpn_1x.yml index 3a76395ee..a889ea283 100644 --- a/configs/mask_rcnn_r50_fpn_1x.yml +++ b/configs/mask_rcnn_r50_fpn_1x.yml @@ -24,7 +24,7 @@ ResNet: depth: 50 feature_maps: [2, 3, 4, 5] freeze_at: 2 - norm_type: affine_channel + norm_type: bn FPN: max_level: 6 diff --git a/docs/GETTING_STARTED.md b/docs/GETTING_STARTED.md index 788f7f408..ca3a6df0a 100644 --- a/docs/GETTING_STARTED.md +++ b/docs/GETTING_STARTED.md @@ -80,6 +80,22 @@ python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ finetune_exclude_pretrained_params = ['cls_score','bbox_pred'] ``` +- Mixed Precision Training + +Mixed precision training can be enabled with `--fp16` flag. Currently Faster-FPN, Mask-FPN and Yolov3 have been verified to be working with little to no loss of precision (less than 0.2 mAP) + +To speed up mixed precision training, it is recommended to train in multi-process mode, for example + +```bash +export PYTHONPATH=$PYTHONPATH:. 
+python -m paddle.distributed.launch --selected_gpus 0,1,2,3,4,5,6,7 tools/train.py --fp16 -c configs/faster_rcnn_r50_fpn_1x.yml +``` + +If loss becomes `NaN` during training, try tweaking the `--loss_scale` value. Please refer to the Nvidia [documentation](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#mptrain) on mixed precision training for details. + +Also, please note mixed precision training currently requires changing `norm_type` from `affine_channel` to `bn`. + + ##### NOTES - `CUDA_VISIBLE_DEVICES` can specify different gpu numbers. Such as: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. GPU calculation rules can refer [FAQ](#faq) diff --git a/docs/GETTING_STARTED_cn.md b/docs/GETTING_STARTED_cn.md index 2f0dff5fe..9bbd12892 100644 --- a/docs/GETTING_STARTED_cn.md +++ b/docs/GETTING_STARTED_cn.md @@ -81,6 +81,21 @@ python -u tools/train.py -c configs/faster_rcnn_r50_1x.yml \ finetune_exclude_pretrained_params = ['cls_score','bbox_pred'] ``` +- 混合精度训练 + +通过设置 `--fp16` 命令行选项可以启用混合精度训练。目前混合精度训练已经在Faster-FPN, Mask-FPN 及 Yolov3 上进行验证，几乎没有精度损失(小于0.2 mAP)。 + +建议使用多进程方式来进一步加速混合精度训练。示例如下。 + +```bash +export PYTHONPATH=$PYTHONPATH:. +python -m paddle.distributed.launch --selected_gpus 0,1,2,3,4,5,6,7 tools/train.py --fp16 -c configs/faster_rcnn_r50_fpn_1x.yml +``` + +如果训练过程中loss出现`NaN`,请尝试调节`--loss_scale`选项数值,细节请参看混合精度训练相关的[Nvidia文档](https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html#mptrain)。 + +另外,请注意将配置文件中的 `norm_type` 由 `affine_channel` 改为 `bn`。 + ##### 提示 - `CUDA_VISIBLE_DEVICES` 参数可以指定不同的GPU。例如: `export CUDA_VISIBLE_DEVICES=0,1,2,3`. 
GPU计算规则可以参考 [FAQ](#faq) diff --git a/tools/train.py b/tools/train.py index 27551c599..b58cacfd1 100644 --- a/tools/train.py +++ b/tools/train.py @@ -122,6 +122,12 @@ def main(): model = create(main_arch) train_pyreader, feed_vars = create_feed(train_feed) + if FLAGS.fp16: + assert (getattr(model.backbone, 'norm_type', None) + != 'affine_channel'), \ + '--fp16 currently does not support affine channel, ' \ + 'please modify backbone settings to use batch norm' + with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: train_fetches = model.train(feed_vars) -- GitLab