From 22b8805bb5a6a6ee28fb228fc4ce7fee4791bef1 Mon Sep 17 00:00:00 2001
From: chajchaj <57249073+chajchaj@users.noreply.github.com>
Date: Wed, 12 Feb 2020 01:06:42 +0800
Subject: [PATCH] add features: train with cpu, save and load checkpoint
 (#4259)

---
 dygraph/mobilenet/RADEME.md                 | 44 ----------
 dygraph/mobilenet/README.md                 | 67 +++++++++++++++
 dygraph/mobilenet/mobilenet_v1.py           |  5 +-
 dygraph/mobilenet/mobilenet_v2.py           |  8 +-
 dygraph/mobilenet/run_cpu_v1.sh             |  1 +
 dygraph/mobilenet/run_cpu_v2.sh             |  1 +
 dygraph/mobilenet/run_mul_v1.sh             |  2 +-
 dygraph/mobilenet/run_mul_v1_checkpoint.sh  |  2 +
 dygraph/mobilenet/run_mul_v2.sh             |  2 +-
 dygraph/mobilenet/run_mul_v2_checkpoint.sh  |  2 +
 dygraph/mobilenet/run_sing_v1.sh            |  2 +-
 dygraph/mobilenet/run_sing_v1_checkpoint.sh |  2 +
 dygraph/mobilenet/run_sing_v2.sh            |  2 +-
 dygraph/mobilenet/run_sing_v2_checkpoint.sh |  2 +
 dygraph/mobilenet/train.py                  | 94 ++++++++++++++-------
 15 files changed, 149 insertions(+), 87 deletions(-)
 delete mode 100644 dygraph/mobilenet/RADEME.md
 create mode 100644 dygraph/mobilenet/README.md
 create mode 100644 dygraph/mobilenet/run_cpu_v1.sh
 create mode 100644 dygraph/mobilenet/run_cpu_v2.sh
 create mode 100644 dygraph/mobilenet/run_mul_v1_checkpoint.sh
 create mode 100644 dygraph/mobilenet/run_mul_v2_checkpoint.sh
 create mode 100644 dygraph/mobilenet/run_sing_v1_checkpoint.sh
 create mode 100644 dygraph/mobilenet/run_sing_v2_checkpoint.sh

diff --git a/dygraph/mobilenet/RADEME.md b/dygraph/mobilenet/RADEME.md
deleted file mode 100644
index beee2f1b..00000000
--- a/dygraph/mobilenet/RADEME.md
+++ /dev/null
@@ -1,44 +0,0 @@
-**模型简介**
-
-图像分类是计算机视觉的重要领域，它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破的成果，同时模型复杂度也在不断增加。MobileNet是一种小巧而高效CNN模型，本文介绍如何使PaddlePaddle的动态图MobileNet进行图像分类。
-
-**代码结构**
-
-    ├── run_mul_v1.sh      # 多卡训练启动脚本_v1
-    ├── run_mul_v2.sh      # 多卡训练启动脚本_v2
-    ├── run_sing_v1.sh     # 单卡训练启动脚本_v1
-    ├── run_sing_v2.sh     # 单卡训练启动脚本_v2
-    ├── train.py           # 训练入口
-    ├── mobilenet_v1.py    # 网络结构v1
-    ├── mobilenet_v2.py    # 网络结构v2
-    ├── reader.py          # 数据reader
-    ├── utils              # 基础工具目录
-
-**数据准备**
-
-请参考：https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification
-
-**模型训练**
-
-若使用4卡训练，启动方式如下:
-
-    bash run_mul_v1.sh
-    bash run_mul_v2.sh
-若使用单卡训练，启动方式如下:
-
-    bash run_sing_v1.sh
-    bash run_sing_v2.sh
-
-**模型精度**
-
-    Model         Top-1      Top-5
-    
-    MobileNetV1    0.707     0.895
-    
-    MobileNetV2    0.626     0.845
-
-**参考论文**
-
-MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
-
-MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
diff --git a/dygraph/mobilenet/README.md b/dygraph/mobilenet/README.md
new file mode 100644
index 00000000..c2e0477e
--- /dev/null
+++ b/dygraph/mobilenet/README.md
@@ -0,0 +1,67 @@
+**模型简介**
+
+图像分类是计算机视觉的重要领域，它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破性的成果，同时模型复杂度也在不断增加。MobileNet是一种小巧而高效的CNN模型，本文介绍如何使用PaddlePaddle的动态图MobileNet进行图像分类。
+
+**代码结构**
+
+    ├── run_mul_v1.sh                 # 多卡训练启动脚本_v1
+    ├── run_mul_v1_checkpoint.sh      # 加载checkpoint多卡训练启动脚本_v1
+    ├── run_mul_v2.sh                 # 多卡训练启动脚本_v2
+    ├── run_mul_v2_checkpoint.sh      # 加载checkpoint多卡训练启动脚本_v2
+    ├── run_sing_v1.sh                # 单卡训练启动脚本_v1
+    ├── run_sing_v1_checkpoint.sh     # 加载checkpoint单卡训练启动脚本_v1
+    ├── run_sing_v2.sh                # 单卡训练启动脚本_v2
+    ├── run_sing_v2_checkpoint.sh     # 加载checkpoint单卡训练启动脚本_v2
+    ├── run_cpu_v1.sh                 # CPU训练启动脚本_v1
+    ├── run_cpu_v2.sh                 # CPU训练启动脚本_v2
+    ├── train.py                      # 训练入口
+    ├── mobilenet_v1.py               # 网络结构v1
+    ├── mobilenet_v2.py               # 网络结构v2
+    ├── reader.py                     # 数据reader
+    ├── utils                         # 基础工具目录
+
+**数据准备**
+
+请参考：https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification
+
+**模型训练**
+
+若使用4卡训练，启动方式如下:
+
+    bash run_mul_v1.sh
+    bash run_mul_v2.sh
+
+若使用单卡训练，启动方式如下:
+
+    bash run_sing_v1.sh
+    bash run_sing_v2.sh
+
+若使用CPU训练，启动方式如下:
+
+    bash run_cpu_v1.sh
+    bash run_cpu_v2.sh
+
+训练过程中，checkpoint会保存在参数model_save_dir指定的文件夹中，我们支持加载checkpoint继续训练。
+加载checkpoint使用4卡训练，启动方式如下:
+
+    bash run_mul_v1_checkpoint.sh
+    bash run_mul_v2_checkpoint.sh
+
+加载checkpoint使用单卡训练，启动方式如下:
+
+    bash run_sing_v1_checkpoint.sh
+    bash run_sing_v2_checkpoint.sh
+
+**模型性能**
+
+    Model          Top-1(单卡/4卡)    Top-5(单卡/4卡)    收敛时间(单卡/4卡)
+    
+    MobileNetV1    0.707/0.711        0.897/0.899        116小时/30.9小时
+    
+    MobileNetV2    0.708/0.724        0.899/0.906        227.8小时/60.8小时
+
+**参考论文**
+
+MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
+
+MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
diff --git a/dygraph/mobilenet/mobilenet_v1.py b/dygraph/mobilenet/mobilenet_v1.py
index 56c12b9a..e3a5a94e 100644
--- a/dygraph/mobilenet/mobilenet_v1.py
+++ b/dygraph/mobilenet/mobilenet_v1.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+#order: standard library, third party, local library 
 import os
 import time
 import sys
+import math
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid import framework
-import math
-import sys
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
diff --git a/dygraph/mobilenet/mobilenet_v2.py b/dygraph/mobilenet/mobilenet_v2.py
index 2466d430..6da031f2 100644
--- a/dygraph/mobilenet/mobilenet_v2.py
+++ b/dygraph/mobilenet/mobilenet_v2.py
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+#order: standard library, third party, local library 
 import os
-import numpy as np
 import time
-import sys
+import math
 import sys
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
-
 from paddle.fluid import framework
 
-import math
-import sys
 
 
 class ConvBNLayer(fluid.dygraph.Layer):
diff --git a/dygraph/mobilenet/run_cpu_v1.sh b/dygraph/mobilenet/run_cpu_v1.sh
new file mode 100644
index 00000000..81de4df3
--- /dev/null
+++ b/dygraph/mobilenet/run_cpu_v1.sh
@@ -0,0 +1 @@
+python3 train.py    --use_gpu=False  --batch_size=64        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
diff --git a/dygraph/mobilenet/run_cpu_v2.sh b/dygraph/mobilenet/run_cpu_v2.sh
new file mode 100644
index 00000000..4c18c006
--- /dev/null
+++ b/dygraph/mobilenet/run_cpu_v2.sh
@@ -0,0 +1 @@
+python3 train.py  --use_gpu=False --batch_size=64      --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
diff --git a/dygraph/mobilenet/run_mul_v1.sh b/dygraph/mobilenet/run_mul_v1.sh
index d84e4d1d..fa48ef5f 100644
--- a/dygraph/mobilenet/run_mul_v1.sh
+++ b/dygraph/mobilenet/run_mul_v1.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256     --reader_thread=8    --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1  --model_save_dir=output.v1.mul/ --num_epochs=120 
diff --git a/dygraph/mobilenet/run_mul_v1_checkpoint.sh b/dygraph/mobilenet/run_mul_v1_checkpoint.sh
new file mode 100644
index 00000000..6b511f19
--- /dev/null
+++ b/dygraph/mobilenet/run_mul_v1_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1.checkpoint train.py --use_data_parallel 1 --batch_size=256     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV1  --model_save_dir=output.v1.mul.checkpoint/ --num_epochs=120 --checkpoint=./output.v1.mul/_mobilenet_v1_epoch50 
diff --git a/dygraph/mobilenet/run_mul_v2.sh b/dygraph/mobilenet/run_mul_v2.sh
index a3f9991e..485cad36 100644
--- a/dygraph/mobilenet/run_mul_v2.sh
+++ b/dygraph/mobilenet/run_mul_v2.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256     --reader_thread=8    --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5  --model=MobileNetV2
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
diff --git a/dygraph/mobilenet/run_mul_v2_checkpoint.sh b/dygraph/mobilenet/run_mul_v2_checkpoint.sh
new file mode 100644
index 00000000..2b1b5587
--- /dev/null
+++ b/dygraph/mobilenet/run_mul_v2_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2.checkpoint train.py --use_data_parallel 1 --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul.checkpoint/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2 --checkpoint=./output.v2.mul/_mobilenet_v2_epoch50
diff --git a/dygraph/mobilenet/run_sing_v1.sh b/dygraph/mobilenet/run_sing_v1.sh
index 3e480faa..c4fef298 100644
--- a/dygraph/mobilenet/run_sing_v1.sh
+++ b/dygraph/mobilenet/run_sing_v1.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0
-python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
+python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1 
diff --git a/dygraph/mobilenet/run_sing_v1_checkpoint.sh b/dygraph/mobilenet/run_sing_v1_checkpoint.sh
new file mode 100644
index 00000000..47d68d96
--- /dev/null
+++ b/dygraph/mobilenet/run_sing_v1_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES=0
+python3 train.py      --batch_size=256        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=./data/ILSVRC2012  --l2_decay=3e-5  --model=MobileNetV1   --checkpoint=./output.v1.sing/_mobilenet_v1_epoch50 
diff --git a/dygraph/mobilenet/run_sing_v2.sh b/dygraph/mobilenet/run_sing_v2.sh
index 9db7a20f..f747ee5e 100644
--- a/dygraph/mobilenet/run_sing_v2.sh
+++ b/dygraph/mobilenet/run_sing_v2.sh
@@ -1,2 +1,2 @@
 export CUDA_VISIBLE_DEVICES=0
-python3 train.py      --batch_size=128        --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1   --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012  --model=MobileNetV2
+python3 train.py  --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2
diff --git a/dygraph/mobilenet/run_sing_v2_checkpoint.sh b/dygraph/mobilenet/run_sing_v2_checkpoint.sh
new file mode 100644
index 00000000..ed77b221
--- /dev/null
+++ b/dygraph/mobilenet/run_sing_v2_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES=0
+python3 train.py  --batch_size=500     --total_images=1281167    --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1  --num_epochs=240  --data_dir=./data/ILSVRC2012 --l2_decay=4e-5  --model=MobileNetV2   --checkpoint=./output.v2.sing/_mobilenet_v2_epoch50 
diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py
index 42648b3e..254279ba 100644
--- a/dygraph/mobilenet/train.py
+++ b/dygraph/mobilenet/train.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,35 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from mobilenet_v1 import *
-from mobilenet_v2 import *
+#order: standard library, third party, local library 
 import os
-import numpy as np
 import time
 import sys
-import sys
-import numpy as np
+import math
 import argparse
-import ast
+import numpy as np
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
-#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
-
 from paddle.fluid import framework
-
-import math
-import sys
 import reader
 from utils import *
-
-IMAGENET1000 = 1281167
-base_lr = 0.1
-momentum_rate = 0.9
-l2_decay = 1e-4
+from mobilenet_v1 import *
+from mobilenet_v2 import *
 
 args = parse_args()
 if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
@@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop):
     for img, label in test_data_loader():
         t1 = time.time()
         label = to_variable(label.numpy().astype('int64').reshape(
-            int(args.batch_size / paddle.fluid.core.get_cuda_device_count()),
+            int(args.batch_size // paddle.fluid.core.get_cuda_device_count()),
             1))
         out = net(img)
         softmax_out = fluid.layers.softmax(out, use_cudnn=False)
@@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop):
 
 
 def train_mobilenet():
-    epoch = args.num_epochs
-    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
-        if args.use_data_parallel else fluid.CUDAPlace(0)
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    elif not args.use_data_parallel:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
     with fluid.dygraph.guard(place):
+        # 1. init net and optimizer
         if args.ce:
             print("ce mode")
             seed = 33
@@ -93,13 +86,12 @@ def train_mobilenet():
         if args.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()
 
-        net = None
         if args.model == "MobileNetV1":
-            net = MobileNetV1(class_dim=args.class_dim)
-            para_name = 'mobilenet_v1_params'
+            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v1'
         elif args.model == "MobileNetV2":
             net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
-            para_name = 'mobilenet_v2_params'
+            model_path_pre = 'mobilenet_v2'
         else:
             print(
                 "wrong model name, please try model = MobileNetV1 or MobileNetV2"
@@ -109,6 +101,18 @@ def train_mobilenet():
         optimizer = create_optimizer(args=args, parameter_list=net.parameters())
         if args.use_data_parallel:
             net = fluid.dygraph.parallel.DataParallel(net, strategy)
+
+        # 2. load checkpoint
+        if args.checkpoint:
+            assert os.path.exists(args.checkpoint + ".pdparams"), \
+                "Given dir {}.pdparams not exist.".format(args.checkpoint)
+            assert os.path.exists(args.checkpoint + ".pdopt"), \
+                "Given dir {}.pdopt not exist.".format(args.checkpoint)
+            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
+            net.set_dict(para_dict)
+            optimizer.set_dict(opti_dict)
+
+        # 3. reader
         train_data_loader, train_data = utility.create_data_loader(
             is_train=True, args=args)
         test_data_loader, test_data = utility.create_data_loader(
@@ -119,7 +123,9 @@ def train_mobilenet():
         test_reader = imagenet_reader.val(settings=args)
         train_data_loader.set_sample_list_generator(train_reader, place)
         test_data_loader.set_sample_list_generator(test_reader, place)
-        for eop in range(epoch):
+
+        # 4. train loop
+        for eop in range(args.num_epochs):
             if num_trainers > 1:
                 imagenet_reader.set_shuffle_seed(eop + (
                     args.random_seed if args.random_seed else 0))
@@ -130,13 +136,17 @@ def train_mobilenet():
             total_sample = 0
             batch_id = 0
             t_last = 0
+            # 4.1 for each batch, call net() , backward(), and minimize()
             for img, label in train_data_loader():
                 t1 = time.time()
                 label = to_variable(label.numpy().astype('int64').reshape(
-                    int(args.batch_size /
+                    int(args.batch_size //
                         paddle.fluid.core.get_cuda_device_count()), 1))
                 t_start = time.time()
+
+                # 4.1.1 call net()
                 out = net(img)
+
                 t_end = time.time()
                 softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                 loss = fluid.layers.cross_entropy(
@@ -145,14 +155,20 @@ def train_mobilenet():
                 acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                 t_start_back = time.time()
+
+                # 4.1.2 call backward()
                 if args.use_data_parallel:
                     avg_loss = net.scale_loss(avg_loss)
                     avg_loss.backward()
                     net.apply_collective_grads()
                 else:
                     avg_loss.backward()
+
                 t_end_back = time.time()
+
+                # 4.1.3 call minimize()
                 optimizer.minimize(avg_loss)
+
                 net.clear_gradients()
                 t2 = time.time()
                 train_batch_elapse = t2 - t1
@@ -174,13 +190,31 @@ def train_mobilenet():
             print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
                   (eop, batch_id, total_loss / total_sample, \
                    total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
-            net.eval()
-            eval(net, test_data_loader, eop)
+   
+            # 4.2 save checkpoint
             save_parameters = (not args.use_data_parallel) or (
                 args.use_data_parallel and
                 fluid.dygraph.parallel.Env().local_rank == 0)
             if save_parameters:
-                fluid.save_dygraph(net.state_dict(), para_name)
+                if not os.path.isdir(args.model_save_dir):
+                    os.makedirs(args.model_save_dir)
+                model_path = os.path.join(
+                    args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop))
+                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
+
+            # 4.3 validation
+            net.eval()
+            eval(net, test_data_loader, eop)
+
+        # 5. save final results
+        save_parameters = (not args.use_data_parallel) or (
+            args.use_data_parallel and
+            fluid.dygraph.parallel.Env().local_rank == 0)
+        if save_parameters:
+            model_path = os.path.join(
+                args.model_save_dir, "_" + model_path_pre + "_final")
+            fluid.dygraph.save_dygraph(net.state_dict(), model_path)
 
 
 if __name__ == '__main__':
-- 
GitLab