add features: train with cpu, save and load checkpoint (#4259)

22b8805b · chajchaj · GitHub · 9983e3a9 · 22b8805b · 22b8805b
14 changed file
--- a/dygraph/mobilenet/RADEME.md
+++ b/dygraph/mobilenet/RADEME.md
@@ -5,9 +5,15 @@
 **代码结构**

    ├── run_mul_v1.sh                 # 多卡训练启动脚本_v1
+    ├── run_mul_v1_checkpoint.sh      # 加载checkpoint多卡训练启动脚本_v1
    ├── run_mul_v2.sh                 # 多卡训练启动脚本_v2
+    ├── run_mul_v2_checkpoint.sh      # 加载checkpoint多卡训练启动脚本_v2
    ├── run_sing_v1.sh                # 单卡训练启动脚本_v1
+    ├── run_sing_v1_checkpoint.sh     # 加载checkpoint单卡训练启动脚本_v1
    ├── run_sing_v2.sh                # 单卡训练启动脚本_v2
+    ├── run_sing_v2_checkpoint.sh     # 加载checkpoint单卡训练启动脚本_v2
+    ├── run_cpu_v1.sh                 # CPU训练启动脚本_v1
+    ├── run_cpu_v2.sh                 # CPU训练启动脚本_v2
    ├── train.py                      # 训练入口
    ├── mobilenet_v1.py               # 网络结构v1
    ├── mobilenet_v2.py               # 网络结构v2
@@ -24,18 +30,35 @@

    bash run_mul_v1.sh
    bash run_mul_v2.sh
+
 若使用单卡训练，启动方式如下:

    bash run_sing_v1.sh
    bash run_sing_v2.sh

-**模型精度**
+若使用CPU训练，启动方式如下:
+
+    bash run_cpu_v1.sh
+    bash run_cpu_v2.sh
+
+训练过程中,checkpoint会保存在参数model_save_dir指定的文件夹中,我们支持加载checkpoint继续训练.
+加载checkpoint使用4卡训练，启动方式如下:
+
+    bash run_mul_v1_checkpoint.sh
+    bash run_mul_v2_checkpoint.sh
+
+加载checkpoint使用单卡训练，启动方式如下:
+
+    bash run_sing_v1_checkpoint.sh
+    bash run_sing_v2_checkpoint.sh
+
+**模型性能**

-    Model         Top-1      Top-5
+    Model          Top-1(单卡/4卡)    Top-5(单卡/4卡)    收敛时间(单卡/4卡)
    
-    MobileNetV1    0.707     0.895
+    MobileNetV1    0.707/0.711        0.897/0.899        116小时/30.9小时
    
-    MobileNetV2    0.626     0.845
+    MobileNetV2    0.708/0.724        0.899/0.906        227.8小时/60.8小时

 **参考论文**


--- a/dygraph/mobilenet/mobilenet_v1.py
+++ b/dygraph/mobilenet/mobilenet_v1.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+#order: standard library, third party, local library 
 import os
 import time
 import sys
+import math
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid import framework
-import math
-import sys


 class ConvBNLayer(fluid.dygraph.Layer):

--- a/dygraph/mobilenet/mobilenet_v2.py
+++ b/dygraph/mobilenet/mobilenet_v2.py
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+#order: standard library, third party, local library 
 import os
-import numpy as np
 import time
-import sys
+import math
 import sys
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
-
 from paddle.fluid import framework

-import math
-import sys


 class ConvBNLayer(fluid.dygraph.Layer):

--- a/dygraph/mobilenet/run_cpu_v1.sh
+++ b/dygraph/mobilenet/run_cpu_v1.sh
+python3 train.py    --use_gpu=False  --batch_size=64        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
--- a/dygraph/mobilenet/run_cpu_v2.sh
+++ b/dygraph/mobilenet/run_cpu_v2.sh
+python3 train.py  --use_gpu=False --batch_size=64      --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=/ssd9/chaj//data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
--- a/dygraph/mobilenet/run_mul_v1.sh
+++ b/dygraph/mobilenet/run_mul_v1.sh
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256     --reader_thread=8    --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1  --model_save_dir=output.v1.mul/ --num_epochs=120 
--- a/dygraph/mobilenet/run_mul_v1_checkpoint.sh
+++ b/dygraph/mobilenet/run_mul_v1_checkpoint.sh
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1.checkpoint train.py --use_data_parallel 1 --batch_size=256     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1  --model_save_dir=output.v1.mul.checkpoint/ --num_epochs=120 --checkpoint=./output.v1.mul/_mobilenet_v1_epoch50 
--- a/dygraph/mobilenet/run_mul_v2.sh
+++ b/dygraph/mobilenet/run_mul_v2.sh
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256     --reader_thread=8    --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV2
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
--- a/dygraph/mobilenet/run_mul_v2_checkpoint.sh
+++ b/dygraph/mobilenet/run_mul_v2_checkpoint.sh
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2.checkpoint train.py --use_data_parallel 1 --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul.checkpoint/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2 --checkpoint=./output.v2.mul/_mobilenet_v2_epoch50
--- a/dygraph/mobilenet/run_sing_v1.sh
+++ b/dygraph/mobilenet/run_sing_v1.sh
 export CUDA_VISIBLE_DEVICES=0
-python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
+python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
--- a/dygraph/mobilenet/run_sing_v1_checkpoint.sh
+++ b/dygraph/mobilenet/run_sing_v1_checkpoint.sh
+export CUDA_VISIBLE_DEVICES=0
+python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1   --checkpoint=./output.v1.sing/_mobilenet_v1_epoch50 
--- a/dygraph/mobilenet/run_sing_v2.sh
+++ b/dygraph/mobilenet/run_sing_v2.sh
 export CUDA_VISIBLE_DEVICES=0
-python3 train.py      --batch_size=128        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012  --model=MobileNetV2
+python3 train.py  --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
--- a/dygraph/mobilenet/run_sing_v2_checkpoint.sh
+++ b/dygraph/mobilenet/run_sing_v2_checkpoint.sh
+export CUDA_VISIBLE_DEVICES=0
+python3 train.py  --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2   --checkpoint=./output.v2.sing/_mobilenet_v2_epoch50 
--- a/dygraph/mobilenet/train.py
+++ b/dygraph/mobilenet/train.py
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,35 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from mobilenet_v1 import *
-from mobilenet_v2 import *
+#order: standard library, third party, local library 
 import os
-import numpy as np
 import time
 import sys
-import sys
-import numpy as np
+import math
 import argparse
-import ast
+import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
-#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
-
 from paddle.fluid import framework
-
-import math
-import sys
 import reader
 from utils import *
-
-IMAGENET1000 = 1281167
-base_lr = 0.1
-momentum_rate = 0.9
-l2_decay = 1e-4
+from mobilenet_v1 import *
+from mobilenet_v2 import *

 args = parse_args()
 if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
@@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop):
    for img, label in test_data_loader():
        t1 = time.time()
        label = to_variable(label.numpy().astype('int64').reshape(
-            int(args.batch_size / paddle.fluid.core.get_cuda_device_count()),
+            int(args.batch_size // paddle.fluid.core.get_cuda_device_count()),
            1))
        out = net(img)
        softmax_out = fluid.layers.softmax(out, use_cudnn=False)
@@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop):


 def train_mobilenet():
-    epoch = args.num_epochs
-    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
-        if args.use_data_parallel else fluid.CUDAPlace(0)
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    elif not args.use_data_parallel:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
    with fluid.dygraph.guard(place):
+        # 1. init net and optimizer
        if args.ce:
            print("ce mode")
            seed = 33
@@ -93,13 +86,12 @@ def train_mobilenet():
        if args.use_data_parallel:
            strategy = fluid.dygraph.parallel.prepare_context()

-        net = None
        if args.model == "MobileNetV1":
-            net = MobileNetV1(class_dim=args.class_dim)
-            para_name = 'mobilenet_v1_params'
+            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v1'
        elif args.model == "MobileNetV2":
            net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
-            para_name = 'mobilenet_v2_params'
+            model_path_pre = 'mobilenet_v2'
        else:
            print(
                "wrong model name, please try model = MobileNetV1 or MobileNetV2"
@@ -109,6 +101,18 @@ def train_mobilenet():
        optimizer = create_optimizer(args=args, parameter_list=net.parameters())
        if args.use_data_parallel:
            net = fluid.dygraph.parallel.DataParallel(net, strategy)
+
+        # 2. load checkpoint
+        if args.checkpoint:
+            assert os.path.exists(args.checkpoint + ".pdparams"), \
+                "Given dir {}.pdparams not exist.".format(args.checkpoint)
+            assert os.path.exists(args.checkpoint + ".pdopt"), \
+                "Given dir {}.pdopt not exist.".format(args.checkpoint)
+            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
+            net.set_dict(para_dict)
+            optimizer.set_dict(opti_dict)
+
+        # 3. reader
        train_data_loader, train_data = utility.create_data_loader(
            is_train=True, args=args)
        test_data_loader, test_data = utility.create_data_loader(
@@ -119,7 +123,9 @@ def train_mobilenet():
        test_reader = imagenet_reader.val(settings=args)
        train_data_loader.set_sample_list_generator(train_reader, place)
        test_data_loader.set_sample_list_generator(test_reader, place)
-        for eop in range(epoch):
+
+        # 4. train loop
+        for eop in range(args.num_epochs):
            if num_trainers > 1:
                imagenet_reader.set_shuffle_seed(eop + (
                    args.random_seed if args.random_seed else 0))
@@ -130,13 +136,17 @@ def train_mobilenet():
            total_sample = 0
            batch_id = 0
            t_last = 0
+            # 4.1 for each batch, call net() , backward(), and minimize()
            for img, label in train_data_loader():
                t1 = time.time()
                label = to_variable(label.numpy().astype('int64').reshape(
-                    int(args.batch_size /
+                    int(args.batch_size //
                        paddle.fluid.core.get_cuda_device_count()), 1))
                t_start = time.time()
+
+                # 4.1.1 call net()
                out = net(img)
+
                t_end = time.time()
                softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                loss = fluid.layers.cross_entropy(
@@ -145,14 +155,20 @@ def train_mobilenet():
                acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                t_start_back = time.time()
+
+                # 4.1.2 call backward()
                if args.use_data_parallel:
                    avg_loss = net.scale_loss(avg_loss)
                    avg_loss.backward()
                    net.apply_collective_grads()
                else:
                    avg_loss.backward()
+
                t_end_back = time.time()
+
+                # 4.1.3 call minimize()
                optimizer.minimize(avg_loss)
+
                net.clear_gradients()
                t2 = time.time()
                train_batch_elapse = t2 - t1
@@ -174,13 +190,31 @@ def train_mobilenet():
            print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
                  (eop, batch_id, total_loss / total_sample, \
                   total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
+   
+            # 4.2 save checkpoint
+            save_parameters = (not args.use_data_parallel) or (
+                args.use_data_parallel and
+                fluid.dygraph.parallel.Env().local_rank == 0)
+            if save_parameters:
+                if not os.path.isdir(args.model_save_dir):
+                    os.makedirs(args.model_save_dir)
+                model_path = os.path.join(
+                    args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop))
+                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
+
+            # 4.3 validation
            net.eval()
            eval(net, test_data_loader, eop)
+
+        # 5. save final results
        save_parameters = (not args.use_data_parallel) or (
            args.use_data_parallel and
            fluid.dygraph.parallel.Env().local_rank == 0)
        if save_parameters:
-                fluid.save_dygraph(net.state_dict(), para_name)
+            model_path = os.path.join(
+                args.model_save_dir, "_" + model_path_pre + "_final")
+            fluid.dygraph.save_dygraph(net.state_dict(), model_path)


 if __name__ == '__main__':