From 79eac5d44551dbec2bc1eeebe05e2f135dbea84c Mon Sep 17 00:00:00 2001
From: root <root@bjyz-sys-gpu-kongming4.bjyz.baidu.com>
Date: Mon, 10 Feb 2020 10:38:40 +0000
Subject: [PATCH] add features: train with cpu, save and load checkpoint

---
 dygraph/mobilenet/README.md                 | 52 +++++++++++++++++++++
 dygraph/mobilenet/run_cpu_v1.sh             |  1 +
 dygraph/mobilenet/run_cpu_v2.sh             |  1 +
 dygraph/mobilenet/run_mul_v1_checkpoint.sh  |  2 +
 dygraph/mobilenet/run_mul_v2_checkpoint.sh  |  2 +
 dygraph/mobilenet/run_sing_v1_checkpoint.sh |  2 +
 dygraph/mobilenet/run_sing_v2_checkpoint.sh |  2 +
 7 files changed, 62 insertions(+)
 create mode 100644 dygraph/mobilenet/README.md
 create mode 100644 dygraph/mobilenet/run_cpu_v1.sh
 create mode 100644 dygraph/mobilenet/run_cpu_v2.sh
 create mode 100644 dygraph/mobilenet/run_mul_v1_checkpoint.sh
 create mode 100644 dygraph/mobilenet/run_mul_v2_checkpoint.sh
 create mode 100644 dygraph/mobilenet/run_sing_v1_checkpoint.sh
 create mode 100644 dygraph/mobilenet/run_sing_v2_checkpoint.sh

diff --git a/dygraph/mobilenet/README.md b/dygraph/mobilenet/README.md
new file mode 100644
index 00000000..5d3a3e64
--- /dev/null
+++ b/dygraph/mobilenet/README.md
@@ -0,0 +1,52 @@
+**模型简介**
+
+图像分类是计算机视觉的重要领域，它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破性的成果，同时模型复杂度也在不断增加。MobileNet是一种小巧而高效的CNN模型，本文介绍如何使用PaddlePaddle的动态图MobileNet进行图像分类。
+
+**代码结构**
+
+    ├── run_mul_v1.sh      # 多卡训练启动脚本_v1
+    ├── run_mul_v2.sh      # 多卡训练启动脚本_v2
+    ├── run_sing_v1.sh     # 单卡训练启动脚本_v1
+    ├── run_sing_v2.sh     # 单卡训练启动脚本_v2
+    ├── run_cpu_v1.sh      # CPU训练启动脚本_v1
+    ├── run_cpu_v2.sh      # CPU训练启动脚本_v2
+    ├── train.py           # 训练入口
+    ├── mobilenet_v1.py    # 网络结构v1
+    ├── mobilenet_v2.py    # 网络结构v2
+    ├── reader.py          # 数据reader
+    └── utils              # 基础工具目录
+
+**数据准备**
+
+请参考：https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification
+
+**模型训练**
+
+若使用4卡训练，启动方式如下:
+
+    bash run_mul_v1.sh
+    bash run_mul_v2.sh
+若使用单卡训练，启动方式如下:
+
+    bash run_sing_v1.sh
+    bash run_sing_v2.sh
+
+若使用CPU训练，启动方式如下:
+
+    bash run_cpu_v1.sh
+    bash run_cpu_v2.sh
+
+
+**模型性能**
+
+    Model          Top-1(单卡/4卡)    Top-5(单卡/4卡)    收敛时间(单卡/4卡)
+    
+    MobileNetV1    0.707/0.711        0.897/0.899        116小时/30.9小时
+    
+    MobileNetV2    0.708/0.724        0.899/0.906        227.8小时/60.8小时
+
+**参考论文**
+
+MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
+
+MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
diff --git a/dygraph/mobilenet/run_cpu_v1.sh b/dygraph/mobilenet/run_cpu_v1.sh
new file mode 100644
index 00000000..81de4df3
--- /dev/null
+++ b/dygraph/mobilenet/run_cpu_v1.sh
@@ -0,0 +1 @@
+python3 train.py --model=MobileNetV1 --use_gpu=False --data_dir=./data/ILSVRC2012 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --batch_size=64 --lr_strategy=piecewise_decay --lr=0.1 --l2_decay=3e-5 --model_save_dir=output/
diff --git a/dygraph/mobilenet/run_cpu_v2.sh b/dygraph/mobilenet/run_cpu_v2.sh
new file mode 100644
index 00000000..4c18c006
--- /dev/null
+++ b/dygraph/mobilenet/run_cpu_v2.sh
@@ -0,0 +1 @@
+python3 train.py --model=MobileNetV2 --use_gpu=False --data_dir=./data/ILSVRC2012 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --batch_size=64 --num_epochs=240 --lr_strategy=cosine_decay --lr=0.1 --l2_decay=4e-5 --model_save_dir=output/
diff --git a/dygraph/mobilenet/run_mul_v1_checkpoint.sh b/dygraph/mobilenet/run_mul_v1_checkpoint.sh
new file mode 100644
index 00000000..6b511f19
--- /dev/null
+++ b/dygraph/mobilenet/run_mul_v1_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1.checkpoint train.py --use_data_parallel 1 --model=MobileNetV1 --checkpoint=./output.v1.mul/_mobilenet_v1_epoch50 --data_dir=./data/ILSVRC2012 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --batch_size=256 --num_epochs=120 --lr_strategy=piecewise_decay --lr=0.1 --l2_decay=3e-5 --model_save_dir=output.v1.mul.checkpoint/
diff --git a/dygraph/mobilenet/run_mul_v2_checkpoint.sh b/dygraph/mobilenet/run_mul_v2_checkpoint.sh
new file mode 100644
index 00000000..2b1b5587
--- /dev/null
+++ b/dygraph/mobilenet/run_mul_v2_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES="0,1,2,3"
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2.checkpoint train.py --use_data_parallel 1 --model=MobileNetV2 --checkpoint=./output.v2.mul/_mobilenet_v2_epoch50 --data_dir=./data/ILSVRC2012 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --batch_size=500 --num_epochs=240 --lr_strategy=cosine_decay --lr=0.1 --l2_decay=4e-5 --model_save_dir=output.v2.mul.checkpoint/
diff --git a/dygraph/mobilenet/run_sing_v1_checkpoint.sh b/dygraph/mobilenet/run_sing_v1_checkpoint.sh
new file mode 100644
index 00000000..47d68d96
--- /dev/null
+++ b/dygraph/mobilenet/run_sing_v1_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES=0
+python3 train.py --model=MobileNetV1 --checkpoint=./output.v1.sing/_mobilenet_v1_epoch50 --data_dir=./data/ILSVRC2012 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --batch_size=256 --lr_strategy=piecewise_decay --lr=0.1 --l2_decay=3e-5 --model_save_dir=output.v1.sing.checkpoint/
diff --git a/dygraph/mobilenet/run_sing_v2_checkpoint.sh b/dygraph/mobilenet/run_sing_v2_checkpoint.sh
new file mode 100644
index 00000000..ed77b221
--- /dev/null
+++ b/dygraph/mobilenet/run_sing_v2_checkpoint.sh
@@ -0,0 +1,2 @@
+export CUDA_VISIBLE_DEVICES=0
+python3 train.py --model=MobileNetV2 --checkpoint=./output.v2.sing/_mobilenet_v2_epoch50 --data_dir=./data/ILSVRC2012 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --batch_size=500 --num_epochs=240 --lr_strategy=cosine_decay --lr=0.1 --l2_decay=4e-5 --model_save_dir=output.v2.sing.checkpoint/
-- 
GitLab