From 5d77480c7e51ba4da4221d79353aa30d799b98de Mon Sep 17 00:00:00 2001
From: chengxianbin
Date: Mon, 25 May 2020 10:34:46 +0800
Subject: [PATCH] modify yolov3&ssd shell script

---
 example/ssd_coco2017/run_distribute_train.sh  | 48 ++++++++++++----
 example/ssd_coco2017/train.py                 | 18 +++---
 .../yolov3_coco2017/run_distribute_train.sh   | 55 ++++++++++++++-----
 .../yolov3_coco2017/run_standalone_train.sh   | 25 +++++++--
 example/yolov3_coco2017/train.py              | 16 ++++--
 5 files changed, 121 insertions(+), 41 deletions(-)

diff --git a/example/ssd_coco2017/run_distribute_train.sh b/example/ssd_coco2017/run_distribute_train.sh
index 4c1049fcc..bd8519be4 100644
--- a/example/ssd_coco2017/run_distribute_train.sh
+++ b/example/ssd_coco2017/run_distribute_train.sh
@@ -14,13 +14,20 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
+echo "================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_train.sh 8 150 coco /data/hccl.json"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_distribute_train.sh 8 350 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.4 as default, if you want other lr, please change the value in this script."
-echo "=============================================================================================================="
+echo "================================================================================================================="
+
+if [ $# != 4 ] && [ $# != 6 ]
+then
+    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [DATASET] \
+[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi

 # Before start distribute train, first create mindrecord files.
 python train.py --only_create_dataset=1
@@ -30,6 +37,8 @@ echo "After running the scipt, the network runs in the background. The log will
 export RANK_SIZE=$1
 EPOCH_SIZE=$2
 DATASET=$3
+PRE_TRAINED=$5
+PRE_TRAINED_EPOCH_SIZE=$6
 export MINDSPORE_HCCL_CONFIG_PATH=$4
@@ -43,12 +52,29 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    python ../train.py \
-    --distribute=1 \
-    --lr=0.4 \
-    --dataset=$DATASET \
-    --device_num=$RANK_SIZE \
-    --device_id=$DEVICE_ID \
-    --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    if [ $# == 4 ]
+    then
+        python ../train.py \
+        --distribute=1 \
+        --lr=0.4 \
+        --dataset=$DATASET \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    fi
+
+    if [ $# == 6 ]
+    then
+        python ../train.py \
+        --distribute=1 \
+        --lr=0.4 \
+        --dataset=$DATASET \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --pre_trained=$PRE_TRAINED \
+        --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+        --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    fi
+
     cd ../
 done
diff --git a/example/ssd_coco2017/train.py b/example/ssd_coco2017/train.py
index 75f9a6d31..9347bf61c 100644
--- a/example/ssd_coco2017/train.py
+++ b/example/ssd_coco2017/train.py
@@ -88,6 +88,7 @@ def main():
     parser.add_argument("--epoch_size", type=int, default=70, help="Epoch size, default is 70.")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
     parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.")
+    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
     parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
     parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
     args_opt = parser.parse_args()
@@ -150,17 +151,20 @@ def main():
     ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
     ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)
-    lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=args_opt.lr,
-                       warmup_epochs=max(args_opt.epoch_size // 20, 1),
-                       total_epochs=args_opt.epoch_size,
-                       steps_per_epoch=dataset_size))
-    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
-    net = TrainingWrapper(net, opt, loss_scale)
-
     if args_opt.pre_trained:
+        if args_opt.pre_trained_epoch_size <= 0:
+            raise KeyError("pre_trained_epoch_size must be greater than 0.")
         param_dict = load_checkpoint(args_opt.pre_trained)
         load_param_into_net(net, param_dict)

+    lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
+                       lr_init=0, lr_end=0, lr_max=args_opt.lr,
+                       warmup_epochs=max(350 // 20, 1),
+                       total_epochs=350,
+                       steps_per_epoch=dataset_size))
+    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
+    net = TrainingWrapper(net, opt, loss_scale)
+
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
     model = Model(net)
diff --git a/example/yolov3_coco2017/run_distribute_train.sh b/example/yolov3_coco2017/run_distribute_train.sh
index 201f19ca1..0b764419d 100644
--- a/example/yolov3_coco2017/run_distribute_train.sh
+++ b/example/yolov3_coco2017/run_distribute_train.sh
@@ -14,18 +14,27 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
+echo "======================================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_train.sh 8 100 /data/Mindrecord_train /data /data/train.txt /data/hccl.json"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
-echo "=============================================================================================================="
+echo "======================================================================================================================================================="
+
+if [ $# != 6 ] && [ $# != 8 ]
+then
+    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi

 EPOCH_SIZE=$2
 MINDRECORD_DIR=$3
 IMAGE_DIR=$4
 ANNO_PATH=$5
+PRE_TRAINED=$7
+PRE_TRAINED_EPOCH_SIZE=$8

 # Before start distribute train, first create mindrecord files.
 python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR \
@@ -51,14 +60,34 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    taskset -c $cmdopt python ../train.py \
-    --distribute=1 \
-    --lr=0.005 \
-    --device_num=$RANK_SIZE \
-    --device_id=$DEVICE_ID \
-    --mindrecord_dir=$MINDRECORD_DIR \
-    --image_dir=$IMAGE_DIR \
-    --epoch_size=$EPOCH_SIZE \
-    --anno_path=$ANNO_PATH > log.txt 2>&1 &
+
+    if [ $# == 6 ]
+    then
+        taskset -c $cmdopt python ../train.py \
+        --distribute=1 \
+        --lr=0.005 \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --mindrecord_dir=$MINDRECORD_DIR \
+        --image_dir=$IMAGE_DIR \
+        --epoch_size=$EPOCH_SIZE \
+        --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    fi
+
+    if [ $# == 8 ]
+    then
+        taskset -c $cmdopt python ../train.py \
+        --distribute=1 \
+        --lr=0.005 \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --mindrecord_dir=$MINDRECORD_DIR \
+        --image_dir=$IMAGE_DIR \
+        --epoch_size=$EPOCH_SIZE \
+        --pre_trained=$PRE_TRAINED \
+        --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+        --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    fi
+
     cd ../
 done
diff --git a/example/yolov3_coco2017/run_standalone_train.sh b/example/yolov3_coco2017/run_standalone_train.sh
index d91d53700..8bce45d89 100644
--- a/example/yolov3_coco2017/run_standalone_train.sh
+++ b/example/yolov3_coco2017/run_standalone_train.sh
@@ -14,10 +14,25 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
+echo "========================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH"
-echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt"
-echo "=============================================================================================================="
+echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
+echo "========================================================================================================================================="

-python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
+if [ $# != 5 ] && [ $# != 7 ]
+then
+    echo "Usage: sh run_standalone_train.sh [DEVICE_ID] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
+
+if [ $# == 5 ]
+then
+    python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
+fi
+
+if [ $# == 7 ]
+then
+    python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
+fi
diff --git a/example/yolov3_coco2017/train.py b/example/yolov3_coco2017/train.py
index 62329bf88..683d7bd78 100644
--- a/example/yolov3_coco2017/train.py
+++ b/example/yolov3_coco2017/train.py
@@ -71,6 +71,7 @@ def main():
     parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
     parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
+    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
     parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
     parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
     parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
@@ -133,14 +134,19 @@ def main():
     ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
     ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)
-    lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0, global_step=args_opt.epoch_size * dataset_size,
-                       decay_step=1000, decay_rate=0.95, steps=True))
-    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
-    net = TrainingWrapper(net, opt, loss_scale)
-
     if args_opt.pre_trained:
+        if args_opt.pre_trained_epoch_size <= 0:
+            raise KeyError("pre_trained_epoch_size must be greater than 0.")
         param_dict = load_checkpoint(args_opt.pre_trained)
         load_param_into_net(net, param_dict)

+    total_epoch_size = 60
+    if args_opt.distribute:
+        total_epoch_size = 160
+    lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
+                       global_step=total_epoch_size * dataset_size,
+                       decay_step=1000, decay_rate=0.95, steps=True))
+    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
+    net = TrainingWrapper(net, opt, loss_scale)
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
--
GitLab
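Both train.py changes follow the same pattern: the checkpoint only restores the weights, while the learning-rate schedule is still built for the full run and the first pre_trained_epoch_size * dataset_size values are treated as already consumed. A minimal sketch of that offset logic for the SSD-style warmup-plus-decay case is below; warmup_decay_lr, the cosine decay shape, and steps_per_epoch=100 are illustrative assumptions, not the repository's actual get_lr implementation.

import math

def warmup_decay_lr(lr_max, total_epochs, steps_per_epoch, warmup_epochs, pre_trained_epochs=0):
    """Toy schedule: linear warmup, then cosine decay over the full run.
    The first pre_trained_epochs worth of steps are dropped, so a job resumed
    from a checkpoint continues at the point in the schedule it left off."""
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            lr = lr_max * (step + 1) / warmup_steps
        else:
            progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
            lr = lr_max * 0.5 * (1.0 + math.cos(math.pi * progress))
        lr_each_step.append(lr)
    # Skip the steps already covered by the pre-trained checkpoint.
    return lr_each_step[pre_trained_epochs * steps_per_epoch:]

# Resuming the SSD example after 200 of 350 epochs: the remaining schedule
# starts deep in the decay phase instead of back at the warmup ramp.
remaining = warmup_decay_lr(lr_max=0.4, total_epochs=350, steps_per_epoch=100,
                            warmup_epochs=max(350 // 20, 1), pre_trained_epochs=200)
print(len(remaining), round(remaining[0], 4))

This is also why the patch hard-codes total_epochs=350 in ssd_coco2017/train.py: the decay is anchored to the full 350-epoch run rather than to the epochs remaining after the checkpoint.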
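The YOLOv3 change applies the same offset to a stepwise exponential-decay schedule: start_step skips the part of the curve the checkpoint has already covered, and total_epoch_size (60 standalone, 160 distributed) fixes the overall length of the schedule. A rough sketch under those assumptions, with exponential_decay_lr and steps_per_epoch=50 as illustrative stand-ins rather than the repository's get_lr:

def exponential_decay_lr(base_lr, total_steps, decay_step, decay_rate, start_step=0):
    """Toy stepwise decay: the rate is multiplied by decay_rate every decay_step
    steps; start_step drops the values a pre-trained checkpoint already used."""
    lr_each_step = [base_lr * (decay_rate ** (i // decay_step)) for i in range(total_steps)]
    return lr_each_step[start_step:]

# Resuming distributed YOLOv3 training after 100 epochs of the 160-epoch schedule.
steps_per_epoch = 50  # assumed dataset size, for illustration only
remaining = exponential_decay_lr(base_lr=0.005,
                                 total_steps=160 * steps_per_epoch,
                                 decay_step=1000, decay_rate=0.95,
                                 start_step=100 * steps_per_epoch)
print(len(remaining), round(remaining[0], 6))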