From 5d77480c7e51ba4da4221d79353aa30d799b98de Mon Sep 17 00:00:00 2001
From: chengxianbin
Date: Mon, 25 May 2020 10:34:46 +0800
Subject: [PATCH] modify yolov3&ssd shell script

---
 example/ssd_coco2017/run_distribute_train.sh  | 48 ++++++++++++----
 example/ssd_coco2017/train.py                 | 18 +++---
 .../yolov3_coco2017/run_distribute_train.sh   | 55 ++++++++++++++-----
 .../yolov3_coco2017/run_standalone_train.sh   | 25 +++++++--
 example/yolov3_coco2017/train.py              | 16 ++++--
 5 files changed, 121 insertions(+), 41 deletions(-)

diff --git a/example/ssd_coco2017/run_distribute_train.sh b/example/ssd_coco2017/run_distribute_train.sh
index 4c1049fcc..bd8519be4 100644
--- a/example/ssd_coco2017/run_distribute_train.sh
+++ b/example/ssd_coco2017/run_distribute_train.sh
@@ -14,13 +14,20 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
+echo "================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_train.sh 8 150 coco /data/hccl.json"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_distribute_train.sh 8 350 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.4 as default, if you want other lr, please change the value in this script."
-echo "=============================================================================================================="
+echo "================================================================================================================="
+
+if [ $# != 4 ] && [ $# != 6 ]
+then
+    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [DATASET] \
+[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi

 # Before start distribute train, first create mindrecord files.
 python train.py --only_create_dataset=1
@@ -30,6 +37,8 @@ echo "After running the scipt, the network runs in the background. The log will
 export RANK_SIZE=$1
 EPOCH_SIZE=$2
 DATASET=$3
+PRE_TRAINED=$5
+PRE_TRAINED_EPOCH_SIZE=$6
 export MINDSPORE_HCCL_CONFIG_PATH=$4
@@ -43,12 +52,29 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    python ../train.py \
-    --distribute=1 \
-    --lr=0.4 \
-    --dataset=$DATASET \
-    --device_num=$RANK_SIZE \
-    --device_id=$DEVICE_ID \
-    --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    if [ $# == 4 ]
+    then
+        python ../train.py \
+        --distribute=1 \
+        --lr=0.4 \
+        --dataset=$DATASET \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    fi
+
+    if [ $# == 6 ]
+    then
+        python ../train.py \
+        --distribute=1 \
+        --lr=0.4 \
+        --dataset=$DATASET \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --pre_trained=$PRE_TRAINED \
+        --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+        --epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
+    fi
+
     cd ../
 done
diff --git a/example/ssd_coco2017/train.py b/example/ssd_coco2017/train.py
index 75f9a6d31..9347bf61c 100644
--- a/example/ssd_coco2017/train.py
+++ b/example/ssd_coco2017/train.py
@@ -88,6 +88,7 @@ def main():
     parser.add_argument("--epoch_size", type=int, default=70, help="Epoch size, default is 70.")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
     parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.")
+    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
     parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
     parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
     args_opt = parser.parse_args()
@@ -150,17 +151,20 @@ def main():
     ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
     ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)
-    lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=args_opt.lr,
-                       warmup_epochs=max(args_opt.epoch_size // 20, 1),
-                       total_epochs=args_opt.epoch_size,
-                       steps_per_epoch=dataset_size))
-    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
-    net = TrainingWrapper(net, opt, loss_scale)
-
     if args_opt.pre_trained:
+        if args_opt.pre_trained_epoch_size <= 0:
+            raise KeyError("pre_trained_epoch_size must be greater than 0.")
         param_dict = load_checkpoint(args_opt.pre_trained)
         load_param_into_net(net, param_dict)

+    lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
+                       lr_init=0, lr_end=0, lr_max=args_opt.lr,
+                       warmup_epochs=max(350 // 20, 1),
+                       total_epochs=350,
+                       steps_per_epoch=dataset_size))
+    opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
+    net = TrainingWrapper(net, opt, loss_scale)
+
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
     model = Model(net)
diff --git a/example/yolov3_coco2017/run_distribute_train.sh b/example/yolov3_coco2017/run_distribute_train.sh
index 201f19ca1..0b764419d 100644
--- a/example/yolov3_coco2017/run_distribute_train.sh
+++ b/example/yolov3_coco2017/run_distribute_train.sh
@@ -14,18 +14,27 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
+echo "======================================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: sh run_distribute_train.sh 8 100 /data/Mindrecord_train /data /data/train.txt /data/hccl.json"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
-echo "=============================================================================================================="
+echo "======================================================================================================================================================="
+
+if [ $# != 6 ] && [ $# != 8 ]
+then
+    echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi

 EPOCH_SIZE=$2
 MINDRECORD_DIR=$3
 IMAGE_DIR=$4
 ANNO_PATH=$5
+PRE_TRAINED=$7
+PRE_TRAINED_EPOCH_SIZE=$8

 # Before start distribute train, first create mindrecord files.
 python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR \
@@ -51,14 +60,34 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    taskset -c $cmdopt python ../train.py \
-    --distribute=1 \
-    --lr=0.005 \
-    --device_num=$RANK_SIZE \
-    --device_id=$DEVICE_ID \
-    --mindrecord_dir=$MINDRECORD_DIR \
-    --image_dir=$IMAGE_DIR \
-    --epoch_size=$EPOCH_SIZE \
-    --anno_path=$ANNO_PATH > log.txt 2>&1 &
+
+    if [ $# == 6 ]
+    then
+        taskset -c $cmdopt python ../train.py \
+        --distribute=1 \
+        --lr=0.005 \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --mindrecord_dir=$MINDRECORD_DIR \
+        --image_dir=$IMAGE_DIR \
+        --epoch_size=$EPOCH_SIZE \
+        --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    fi
+
+    if [ $# == 8 ]
+    then
+        taskset -c $cmdopt python ../train.py \
+        --distribute=1 \
+        --lr=0.005 \
+        --device_num=$RANK_SIZE \
+        --device_id=$DEVICE_ID \
+        --mindrecord_dir=$MINDRECORD_DIR \
+        --image_dir=$IMAGE_DIR \
+        --epoch_size=$EPOCH_SIZE \
+        --pre_trained=$PRE_TRAINED \
+        --pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
+        --anno_path=$ANNO_PATH > log.txt 2>&1 &
+    fi
+
     cd ../
 done
diff --git a/example/yolov3_coco2017/run_standalone_train.sh b/example/yolov3_coco2017/run_standalone_train.sh
index d91d53700..8bce45d89 100644
--- a/example/yolov3_coco2017/run_standalone_train.sh
+++ b/example/yolov3_coco2017/run_standalone_train.sh
@@ -14,10 +14,25 @@
 # limitations under the License.
 # ============================================================================

-echo "=============================================================================================================="
+echo "========================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH"
-echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt"
-echo "=============================================================================================================="
+echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
+echo "========================================================================================================================================="

-python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
+if [ $# != 5 ] && [ $# != 7 ]
+then
+    echo "Usage: sh run_standalone_train.sh [DEVICE_ID] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] \
+[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+    exit 1
+fi
+
+if [ $# == 5 ]
+then
+    python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
+fi
+
+if [ $# == 7 ]
+then
+    python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
+fi
diff --git a/example/yolov3_coco2017/train.py b/example/yolov3_coco2017/train.py
index 62329bf88..683d7bd78 100644
--- a/example/yolov3_coco2017/train.py
+++ b/example/yolov3_coco2017/train.py
@@ -71,6 +71,7 @@ def main():
     parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10")
     parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
     parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
+    parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
     parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
     parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
     parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
@@ -133,14 +134,19 @@ def main():
     ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
     ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)
-    lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0, global_step=args_opt.epoch_size * dataset_size,
-                       decay_step=1000, decay_rate=0.95, steps=True))
-    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
-    net = TrainingWrapper(net, opt, loss_scale)
-
     if args_opt.pre_trained:
+        if args_opt.pre_trained_epoch_size <= 0:
+            raise KeyError("pre_trained_epoch_size must be greater than 0.")
         param_dict = load_checkpoint(args_opt.pre_trained)
         load_param_into_net(net, param_dict)

+    total_epoch_size = 60
+    if args_opt.distribute:
+        total_epoch_size = 160
+    lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
+                       global_step=total_epoch_size * dataset_size,
+                       decay_step=1000, decay_rate=0.95, steps=True))
+    opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
+    net = TrainingWrapper(net, opt, loss_scale)
     callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
--
GitLab
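Both train.py changes follow the same pattern: the checkpoint only restores the weights, while the learning-rate schedule is still built for the full run and the first pre_trained_epoch_size * dataset_size values are treated as already consumed. A minimal sketch of that offset logic for the SSD-style warmup-plus-decay case is below; warmup_decay_lr, the cosine decay shape, and steps_per_epoch=100 are illustrative assumptions, not the repository's actual get_lr implementation.

import math

def warmup_decay_lr(lr_max, total_epochs, steps_per_epoch, warmup_epochs, pre_trained_epochs=0):
    """Toy schedule: linear warmup, then cosine decay over the full run.
    The first pre_trained_epochs worth of steps are dropped, so a job resumed
    from a checkpoint continues at the point in the schedule it left off."""
    total_steps = total_epochs * steps_per_epoch
    warmup_steps = warmup_epochs * steps_per_epoch
    lr_each_step = []
    for step in range(total_steps):
        if step < warmup_steps:
            lr = lr_max * (step + 1) / warmup_steps
        else:
            progress = (step - warmup_steps) / max(total_steps - warmup_steps, 1)
            lr = lr_max * 0.5 * (1.0 + math.cos(math.pi * progress))
        lr_each_step.append(lr)
    # Skip the steps already covered by the pre-trained checkpoint.
    return lr_each_step[pre_trained_epochs * steps_per_epoch:]

# Resuming the SSD example after 200 of 350 epochs: the remaining schedule
# starts deep in the decay phase instead of back at the warmup ramp.
remaining = warmup_decay_lr(lr_max=0.4, total_epochs=350, steps_per_epoch=100,
                            warmup_epochs=max(350 // 20, 1), pre_trained_epochs=200)
print(len(remaining), round(remaining[0], 4))

This is also why the patch hard-codes total_epochs=350 in ssd_coco2017/train.py: the decay is anchored to the full 350-epoch run rather than to the epochs remaining after the checkpoint.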
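The YOLOv3 change applies the same offset to a stepwise exponential-decay schedule: start_step skips the part of the curve the checkpoint has already covered, and total_epoch_size (60 standalone, 160 distributed) fixes the overall length of the schedule. A rough sketch under those assumptions, with exponential_decay_lr and steps_per_epoch=50 as illustrative stand-ins rather than the repository's get_lr:

def exponential_decay_lr(base_lr, total_steps, decay_step, decay_rate, start_step=0):
    """Toy stepwise decay: the rate is multiplied by decay_rate every decay_step
    steps; start_step drops the values a pre-trained checkpoint already used."""
    lr_each_step = [base_lr * (decay_rate ** (i // decay_step)) for i in range(total_steps)]
    return lr_each_step[start_step:]

# Resuming distributed YOLOv3 training after 100 epochs of the 160-epoch schedule.
steps_per_epoch = 50  # assumed dataset size, for illustration only
remaining = exponential_decay_lr(base_lr=0.005,
                                 total_steps=160 * steps_per_epoch,
                                 decay_step=1000, decay_rate=0.95,
                                 start_step=100 * steps_per_epoch)
print(len(remaining), round(remaining[0], 6))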