From 801edd0da964c4ce8f5d8ff56c9912c2e8f81883 Mon Sep 17 00:00:00 2001
From: Zhen Wang
Date: Fri, 17 Jan 2020 15:18:10 +0800
Subject: [PATCH] Add the fp16 support for ResNet50 (#4185)

* add the fp16 support for ResNet50

* add docs for AMP.
---
 PaddleCV/image_classification/README.md      | 32 +++++++---
 PaddleCV/image_classification/README_en.md   | 31 +++++++---
 PaddleCV/image_classification/build_model.py | 13 +++-
 .../image_classification/models/resnet.py    | 60 ++++++++++++-------
 .../scripts/train/ResNet50_fp16.sh           | 40 +++++++++++++
 .../image_classification/utils/utility.py    | 17 ++++--
 6 files changed, 146 insertions(+), 47 deletions(-)
 create mode 100755 PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh

diff --git a/PaddleCV/image_classification/README.md b/PaddleCV/image_classification/README.md
index f6901ffb..507932b4 100644
--- a/PaddleCV/image_classification/README.md
+++ b/PaddleCV/image_classification/README.md
@@ -365,15 +365,31 @@ For an introduction to Mixup, refer to [mixup: Beyond Empirical Risk Minimization](https://arxiv

 ### Mixed-Precision Training

-Pass --fp16=True to start mixed training. The float16 data type is used during training, and the model parameters are saved as float32. You may also need to pass --scale_loss to address the accuracy issues of fp16 training; passing --scale_loss=0.8 is usually sufficient.
+Pass --use_fp16=True to start mixed-precision training. The float16 data type is used during training, and the model parameters are saved as float32. You may also need to pass --scale_loss to address the accuracy issues of fp16 training, e.g. --scale_loss=128.0.

-```bash
-python train.py \
-	--model=ResNet50 \
-	--fp16=True \
-	--scale_loss=0.8
-```
-For details, see [Fleet](https://github.com/PaddlePaddle/Fleet/tree/develop/benchmark/collective/resnet)
+After configuring the dataset path (set the value of `DATA_DIR` in [scripts/train/ResNet50_fp16.sh](scripts/train/ResNet50_fp16.sh)), mixed-precision training of the ResNet50 model can be started by running `bash run.sh train ResNet50_fp16`.
+
+For multi-machine, multi-card mixed-precision training of ResNet50, refer to [PaddlePaddle/Fleet](https://github.com/PaddlePaddle/Fleet/tree/develop/benchmark/collective/resnet).
+
+The results of mixed-precision training of ResNet50 on Tesla V100 with a single machine (8 cards), 2 machines (16 cards), and 4 machines (32 cards) are as follows (DALI enabled):
+
+* BatchSize = 256
+
+nodes*cards|throughput|speedup|test\_acc1|test\_acc5
+---|---|---|---|---
+1*1|1035 ins/s|1|0.75333|0.92702
+1*8|7840 ins/s|7.57|0.75603|0.92771
+2*8|14277 ins/s|13.79|0.75872|0.92793
+4*8|28594 ins/s|27.63|0.75253|0.92713
+
+* BatchSize = 128
+
+nodes*cards|throughput|speedup|test\_acc1|test\_acc5
+---|---|---|---|---
+1*1|936 ins/s|1|0.75280|0.92531
+1*8|7108 ins/s|7.59|0.75832|0.92771
+2*8|12343 ins/s|13.18|0.75766|0.92723
+4*8|24407 ins/s|26.07|0.75859|0.92871

 ### Performance Analysis
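The three flags documented above (`--use_fp16`, `--scale_loss`, `--use_dynamic_loss_scaling`) map onto Paddle 1.x's static-graph AMP decorator. Below is a minimal sketch of that wiring, assuming the `fluid.contrib.mixed_precision.decorate` API available at the time of this patch; the wrapper name `build_optimizer` is illustrative and not part of the patch:

```python
import paddle.fluid as fluid

def build_optimizer(use_fp16=True, scale_loss=128.0, use_dynamic_loss_scaling=True):
    # Piecewise-decay SGD with momentum, as in the training scripts.
    optimizer = fluid.optimizer.Momentum(learning_rate=0.1, momentum=0.9)
    if use_fp16:
        # AMP decoration: fp16 compute with fp32 master weights, plus loss
        # scaling so that small fp16 gradients do not underflow to zero.
        optimizer = fluid.contrib.mixed_precision.decorate(
            optimizer,
            init_loss_scaling=scale_loss,
            use_dynamic_loss_scaling=use_dynamic_loss_scaling)
    return optimizer
```

With dynamic loss scaling enabled, `init_loss_scaling` is only a starting point; the scale is adjusted automatically when gradients overflow, which is why a large value such as 128.0 is safe here.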
diff --git a/PaddleCV/image_classification/README_en.md b/PaddleCV/image_classification/README_en.md
index 1f1cbbf3..437cd1ba 100644
--- a/PaddleCV/image_classification/README_en.md
+++ b/PaddleCV/image_classification/README_en.md
@@ -253,16 +253,31 @@ Refer to [mixup: Beyond Empirical Risk Minimization](https://arxiv.org/abs/1710.

 ### Using Mixed-Precision Training

-Set --fp16=True to sart Mixed-Precision Training.
+Set --use_fp16=True to start Automatic Mixed Precision (AMP) training. During training, the float16 data type is used to speed up computation. You may need to pass the --scale_loss parameter to avoid accuracy drops, e.g. --scale_loss=128.0.

-```bash
-python train.py \
-	--model=ResNet50 \
-	--fp16=True \
-	--scale_loss=0.8
-```
+After configuring the data path (modify the value of `DATA_DIR` in [scripts/train/ResNet50_fp16.sh](scripts/train/ResNet50_fp16.sh)), you can start AMP training of ResNet50 by executing `bash run.sh train ResNet50_fp16`.
+
+Refer to [PaddlePaddle/Fleet](https://github.com/PaddlePaddle/Fleet/tree/develop/benchmark/collective/resnet) for multi-machine, multi-card training.
+
+Running on Tesla V100 with a single machine (8 cards), two machines (16 cards), and four machines (32 cards), the performance of ResNet50 AMP training is shown below (DALI enabled).
+
+* BatchSize = 256
+
+nodes*cards|throughput|speedup|test\_acc1|test\_acc5
+---|---|---|---|---
+1*1|1035 ins/s|1|0.75333|0.92702
+1*8|7840 ins/s|7.57|0.75603|0.92771
+2*8|14277 ins/s|13.79|0.75872|0.92793
+4*8|28594 ins/s|27.63|0.75253|0.92713
+
+* BatchSize = 128

-Refer to [PaddlePaddle/Fleet](https://github.com/PaddlePaddle/Fleet/tree/develop/benchmark/collective/resnet)
+nodes*cards|throughput|speedup|test\_acc1|test\_acc5
+---|---|---|---|---
+1*1|936 ins/s|1|0.75280|0.92531
+1*8|7108 ins/s|7.59|0.75832|0.92771
+2*8|12343 ins/s|13.18|0.75766|0.92723
+4*8|24407 ins/s|26.07|0.75859|0.92871

 ### Preprocessing with Nvidia DALI

diff --git a/PaddleCV/image_classification/build_model.py b/PaddleCV/image_classification/build_model.py
index 04c0d0ee..a0dfd131 100644
--- a/PaddleCV/image_classification/build_model.py
+++ b/PaddleCV/image_classification/build_model.py
@@ -35,8 +35,11 @@ def _calc_label_smoothing_loss(softmax_out, label, class_dim, epsilon):
 def _basic_model(data, model, args, is_train):
     image = data[0]
     label = data[1]
-
-    net_out = model.net(input=image, class_dim=args.class_dim)
+    if args.model == "ResNet50":
+        image_in = fluid.layers.transpose(image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
+        net_out = model.net(input=image_in, class_dim=args.class_dim, data_format=args.data_format)
+    else:
+        net_out = model.net(input=image, class_dim=args.class_dim)

     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if is_train and args.use_label_smoothing:
@@ -88,7 +91,11 @@ def _mixup_model(data, model, args, is_train):
     y_b = data[2]
     lam = data[3]

-    net_out = model.net(input=image, class_dim=args.class_dim)
+    if args.model == "ResNet50":
+        image_in = fluid.layers.transpose(image, [0, 2, 3, 1]) if args.data_format == 'NHWC' else image
+        net_out = model.net(input=image_in, class_dim=args.class_dim, data_format=args.data_format)
+    else:
+        net_out = model.net(input=image, class_dim=args.class_dim)
     softmax_out = fluid.layers.softmax(net_out, use_cudnn=False)
     if not args.use_label_smoothing:
         loss_a = fluid.layers.cross_entropy(input=softmax_out, label=y_a)
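The NHWC branch above hinges on a single layout permutation at the model input. As a quick standalone illustration (NumPy standing in for the graph op, purely for exposition), the `[0, 2, 3, 1]` axis order moves channels to the innermost dimension:

```python
import numpy as np

# A batch in NCHW layout: (batch, channels, height, width).
nchw = np.zeros((32, 3, 224, 224), dtype=np.float16)

# Same axis order as fluid.layers.transpose(image, [0, 2, 3, 1]):
# the batch axis stays first, channels move to the last axis.
nhwc = np.transpose(nchw, (0, 2, 3, 1))

print(nhwc.shape)  # (32, 224, 224, 3), i.e. NHWC -- the layout fp16 Tensor Core kernels prefer
```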
diff --git a/PaddleCV/image_classification/models/resnet.py b/PaddleCV/image_classification/models/resnet.py
index bb68d018..fcf45358 100644
--- a/PaddleCV/image_classification/models/resnet.py
+++ b/PaddleCV/image_classification/models/resnet.py
@@ -31,7 +31,7 @@ class ResNet():
     def __init__(self, layers=50):
         self.layers = layers

-    def net(self, input, class_dim=1000):
+    def net(self, input, class_dim=1000, data_format="NCHW"):
         layers = self.layers
         supported_layers = [18, 34, 50, 101, 152]
         assert layers in supported_layers, \
@@ -53,13 +53,15 @@ class ResNet():
             filter_size=7,
             stride=2,
             act='relu',
-            name="conv1")
+            name="conv1",
+            data_format=data_format)
         conv = fluid.layers.pool2d(
             input=conv,
             pool_size=3,
             pool_stride=2,
             pool_padding=1,
-            pool_type='max')
+            pool_type='max',
+            data_format=data_format)
         if layers >= 50:
             for block in range(len(depth)):
                 for i in range(depth[block]):
@@ -74,10 +76,11 @@ class ResNet():
                         input=conv,
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
-                        name=conv_name)
+                        name=conv_name,
+                        data_format=data_format)

             pool = fluid.layers.pool2d(
-                input=conv, pool_type='avg', global_pooling=True)
+                input=conv, pool_type='avg', global_pooling=True, data_format=data_format)
             stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
             out = fluid.layers.fc(
                 input=pool,
@@ -93,10 +96,11 @@ class ResNet():
                         num_filters=num_filters[block],
                         stride=2 if i == 0 and block != 0 else 1,
                         is_first=block == i == 0,
-                        name=conv_name)
+                        name=conv_name,
+                        data_format=data_format)

             pool = fluid.layers.pool2d(
-                input=conv, pool_type='avg', global_pooling=True)
+                input=conv, pool_type='avg', global_pooling=True, data_format=data_format)
             stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
             out = fluid.layers.fc(
                 input=pool,
@@ -112,7 +116,8 @@ class ResNet():
                       stride=1,
                       groups=1,
                       act=None,
-                      name=None):
+                      name=None,
+                      data_format='NCHW'):
         conv = fluid.layers.conv2d(
             input=input,
             num_filters=num_filters,
@@ -123,7 +128,8 @@ class ResNet():
             act=None,
             param_attr=ParamAttr(name=name + "_weights"),
             bias_attr=False,
-            name=name + '.conv2d.output.1')
+            name=name + '.conv2d.output.1',
+            data_format=data_format)
         if name == "conv1":
             bn_name = "bn_" + name
@@ -136,62 +142,72 @@ class ResNet():
             param_attr=ParamAttr(name=bn_name + '_scale'),
             bias_attr=ParamAttr(bn_name + '_offset'),
             moving_mean_name=bn_name + '_mean',
-            moving_variance_name=bn_name + '_variance', )
+            moving_variance_name=bn_name + '_variance',
+            data_layout=data_format)

-    def shortcut(self, input, ch_out, stride, is_first, name):
-        ch_in = input.shape[1]
+    def shortcut(self, input, ch_out, stride, is_first, name, data_format):
+        if data_format == 'NCHW':
+            ch_in = input.shape[1]
+        else:
+            ch_in = input.shape[-1]
         if ch_in != ch_out or stride != 1 or is_first == True:
-            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+            return self.conv_bn_layer(input, ch_out, 1, stride, name=name, data_format=data_format)
         else:
             return input

-    def bottleneck_block(self, input, num_filters, stride, name):
+    def bottleneck_block(self, input, num_filters, stride, name, data_format):
         conv0 = self.conv_bn_layer(
             input=input,
             num_filters=num_filters,
             filter_size=1,
             act='relu',
-            name=name + "_branch2a")
+            name=name + "_branch2a",
+            data_format=data_format)
         conv1 = self.conv_bn_layer(
             input=conv0,
             num_filters=num_filters,
             filter_size=3,
             stride=stride,
             act='relu',
-            name=name + "_branch2b")
+            name=name + "_branch2b",
+            data_format=data_format)
         conv2 = self.conv_bn_layer(
             input=conv1,
             num_filters=num_filters * 4,
             filter_size=1,
             act=None,
-            name=name + "_branch2c")
+            name=name + "_branch2c",
+            data_format=data_format)

         short = self.shortcut(
             input,
             num_filters * 4,
             stride,
             is_first=False,
-            name=name + "_branch1")
+            name=name + "_branch1",
+            data_format=data_format)

         return fluid.layers.elementwise_add(
             x=short, y=conv2, act='relu', name=name + ".add.output.5")

-    def basic_block(self, input, num_filters, stride, is_first, name):
+    def basic_block(self, input, num_filters, stride, is_first, name, data_format):
         conv0 = self.conv_bn_layer(
             input=input,
             num_filters=num_filters,
             filter_size=3,
             act='relu',
             stride=stride,
-            name=name + "_branch2a")
+            name=name + "_branch2a",
+            data_format=data_format)
         conv1 = self.conv_bn_layer(
             input=conv0,
             num_filters=num_filters,
             filter_size=3,
             act=None,
-            name=name + "_branch2b")
+            name=name + "_branch2b",
+            data_format=data_format)
         short = self.shortcut(
-            input, num_filters, stride, is_first, name=name + "_branch1")
+            input, num_filters, stride, is_first, name=name + "_branch1", data_format=data_format)
         return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
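With two layouts in play, the channel dimension no longer has a fixed position, which is exactly what the `shortcut` change above handles. A tiny self-contained illustration of the rule (the helper name `channel_dim` is hypothetical, not part of the patch):

```python
def channel_dim(shape, data_format):
    """Channel count of a 4-D tensor shape: axis 1 for NCHW, last axis for NHWC."""
    return shape[1] if data_format == 'NCHW' else shape[-1]

# The residual shortcut needs a 1x1 projection only when channel counts
# differ, so both layouts must report the same channel count.
assert channel_dim((8, 64, 56, 56), 'NCHW') == 64
assert channel_dim((8, 56, 56, 64), 'NHWC') == 64
```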
diff --git a/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
new file mode 100755
index 00000000..6ebd5c01
--- /dev/null
+++ b/PaddleCV/image_classification/scripts/train/ResNet50_fp16.sh
@@ -0,0 +1,40 @@
+#!/bin/bash -ex
+
+export FLAGS_conv_workspace_size_limit=4000 # MB
+export FLAGS_cudnn_exhaustive_search=1
+export FLAGS_cudnn_batchnorm_spatial_persistent=1
+
+DATA_DIR="Your image dataset path, e.g. /work/datasets/ILSVRC2012/"
+
+DATA_FORMAT="NHWC"
+USE_FP16=true # whether to use float16
+USE_DALI=true
+
+if ${USE_DALI}; then
+    export FLAGS_fraction_of_gpu_memory_to_use=0.8
+fi
+
+python train.py \
+    --model=ResNet50 \
+    --data_dir=${DATA_DIR} \
+    --batch_size=256 \
+    --total_images=1281167 \
+    --image_shape 3 224 224 \
+    --class_dim=1000 \
+    --print_step=10 \
+    --model_save_dir=output/ \
+    --lr_strategy=piecewise_decay \
+    --use_fp16=${USE_FP16} \
+    --scale_loss=128.0 \
+    --use_dynamic_loss_scaling=true \
+    --data_format=${DATA_FORMAT} \
+    --fuse_elewise_add_act_ops=true \
+    --fuse_bn_act_ops=true \
+    --validate=true \
+    --is_profiler=false \
+    --profiler_path=profile/ \
+    --reader_thread=10 \
+    --reader_buf_size=4000 \
+    --use_dali=${USE_DALI} \
+    --lr=0.1
+
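The `FLAGS_*` variables exported by this script are consumed by Paddle from the process environment when the framework initializes. If training is launched directly from Python rather than through the shell script, a sketch of the equivalent setup (assuming, as in Paddle 1.x, that the flags are picked up from the environment at import time):

```python
import os

# These must be set before paddle.fluid is imported, because the framework
# reads FLAGS_* from the environment when it initializes.
os.environ['FLAGS_conv_workspace_size_limit'] = '4000'        # cuDNN conv workspace limit, in MB
os.environ['FLAGS_cudnn_exhaustive_search'] = '1'             # benchmark conv algorithms, cache the fastest
os.environ['FLAGS_cudnn_batchnorm_spatial_persistent'] = '1'  # fast cuDNN batch-norm kernel (fp16 + NHWC)

import paddle.fluid as fluid  # deliberately imported after the flags are set
```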
diff --git a/PaddleCV/image_classification/utils/utility.py b/PaddleCV/image_classification/utils/utility.py
index 37794603..5abd0c7d 100644
--- a/PaddleCV/image_classification/utils/utility.py
+++ b/PaddleCV/image_classification/utils/utility.py
@@ -65,7 +65,7 @@ def print_arguments(args):


 def add_arguments(argname, type, default, help, argparser, **kwargs):
-    """Add argparse's argument. 
+    """Add argparse's argument.

     Usage:

@@ -87,7 +87,7 @@ def add_arguments(argname, type, default, help, argparser, **kwargs):
 def parse_args():
     """Add arguments

-    Returns: 
+    Returns:
         all training args
     """
     parser = argparse.ArgumentParser(description=__doc__)
@@ -142,6 +142,9 @@ def parse_args():
     add_arg('use_fp16',                 bool,  False,  "Whether to enable half precision training with fp16.")
     add_arg('scale_loss',               float, 1.0,    "The value of scale_loss for fp16.")
     add_arg('use_dynamic_loss_scaling', bool,  True,   "Whether to use dynamic loss scaling.")
+    add_arg('data_format',              str,   "NCHW", "Tensor data format when training.")
+    add_arg('fuse_elewise_add_act_ops', bool,  False,  "Whether to fuse elementwise_add and activation ops.")
+    add_arg('fuse_bn_act_ops',          bool,  False,  "Whether to fuse batch_norm and activation ops.")
     add_arg('use_label_smoothing',      bool,  False,  "Whether to use label_smoothing")
     add_arg('label_smoothing_epsilon',  float, 0.1,    "The value of label_smoothing_epsilon parameter")
@@ -168,7 +171,7 @@ def parse_args():


 def check_gpu():
-    """ 
+    """
     Log error and exit when set use_gpu=true in paddlepaddle
     cpu version.
     """
@@ -364,12 +367,12 @@ def create_data_loader(is_train, args):
     Usage:
         Using mixup process in training, it will return 5 results, include data_loader, image, y_a(label), y_b(label) and lamda, or it will return 3 results, include data_loader, image, and label.
-    Args: 
+    Args:
         is_train: mode
         args: arguments

     Returns:
-        data_loader and the input data of net, 
+        data_loader and the input data of net,
     """
     image_shape = args.image_shape
     feed_image = fluid.data(
@@ -428,7 +431,7 @@ def print_info(info_mode,
         time_info: time information
         info_mode: mode
     """
-    #XXX: Use specific name to choose pattern, not the length of metrics. 
+    #XXX: Use specific name to choose pattern, not the length of metrics.
     if info_mode == "batch":
         if batch_id % print_step == 0:
             #if isinstance(metrics,np.ndarray):
@@ -518,6 +521,8 @@ def best_strategy_compiled(args,
         return program
     else:
         build_strategy = fluid.compiler.BuildStrategy()
+        build_strategy.fuse_bn_act_ops = args.fuse_bn_act_ops
+        build_strategy.fuse_elewise_add_act_ops = args.fuse_elewise_add_act_ops

         exec_strategy = fluid.ExecutionStrategy()
-- 
GitLab
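To see where the two new `BuildStrategy` flags land at compile time, here is a minimal sketch of building a data-parallel compiled program with the fusion passes enabled (Paddle 1.x static-graph API; the helper name `compile_with_fusion` and its arguments are illustrative, not part of the patch):

```python
import paddle.fluid as fluid

def compile_with_fusion(program, loss_name, fuse_bn_act=True, fuse_elewise_add_act=True):
    build_strategy = fluid.compiler.BuildStrategy()
    # Fuse batch_norm + activation into one kernel; this pairs with
    # FLAGS_cudnn_batchnorm_spatial_persistent for NHWC fp16 training.
    build_strategy.fuse_bn_act_ops = fuse_bn_act
    # Fuse elementwise_add + activation, e.g. the residual add followed by relu.
    build_strategy.fuse_elewise_add_act_ops = fuse_elewise_add_act

    exec_strategy = fluid.ExecutionStrategy()
    return fluid.CompiledProgram(program).with_data_parallel(
        loss_name=loss_name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)
```

Both fusions are graph rewrites applied when the program is compiled, so they change kernel launches but not the model's numerics in fp32; under fp16 they mainly reduce memory traffic between the fused ops.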