diff --git a/dygraph/mobilenet/RADEME.md b/dygraph/mobilenet/RADEME.md deleted file mode 100644 index beee2f1b00b32e19d1f9f25caf1695c465ee2984..0000000000000000000000000000000000000000 --- a/dygraph/mobilenet/RADEME.md +++ /dev/null @@ -1,44 +0,0 @@ -**模型简介** - -图像分类是计算机视觉的重要领域,它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破的成果,同时模型复杂度也在不断增加。MobileNet是一种小巧而高效CNN模型,本文介绍如何使PaddlePaddle的动态图MobileNet进行图像分类。 - -**代码结构** - - ├── run_mul_v1.sh # 多卡训练启动脚本_v1 - ├── run_mul_v2.sh # 多卡训练启动脚本_v2 - ├── run_sing_v1.sh # 单卡训练启动脚本_v1 - ├── run_sing_v2.sh # 单卡训练启动脚本_v2 - ├── train.py # 训练入口 - ├── mobilenet_v1.py # 网络结构v1 - ├── mobilenet_v2.py # 网络结构v2 - ├── reader.py # 数据reader - ├── utils # 基础工具目录 - -**数据准备** - -请参考:https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification - -**模型训练** - -若使用4卡训练,启动方式如下: - - bash run_mul_v1.sh - bash run_mul_v2.sh -若使用单卡训练,启动方式如下: - - bash run_sing_v1.sh - bash run_sing_v2.sh - -**模型精度** - - Model Top-1 Top-5 - - MobileNetV1 0.707 0.895 - - MobileNetV2 0.626 0.845 - -**参考论文** - -MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam - -MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen diff --git a/dygraph/mobilenet/README.md b/dygraph/mobilenet/README.md index 5d3a3e64698727ec4d6eb16b7b2b378e71b3ad16..c2e0477e753d5abe9f90ed17682ad0e58f8e52e3 100644 --- a/dygraph/mobilenet/README.md +++ b/dygraph/mobilenet/README.md @@ -4,17 +4,21 @@ **代码结构** - ├── run_mul_v1.sh # 多卡训练启动脚本_v1 - ├── run_mul_v2.sh # 多卡训练启动脚本_v2 - ├── run_sing_v1.sh # 单卡训练启动脚本_v1 - ├── run_sing_v2.sh # 单卡训练启动脚本_v2 - ├── run_cpu_v1.sh # CPU训练启动脚本_v1 - ├── run_cpu_v2.sh # CPU训练启动脚本_v2 - ├── train.py # 训练入口 - ├── mobilenet_v1.py # 网络结构v1 - ├── mobilenet_v2.py # 网络结构v2 - ├── reader.py # 数据reader - ├── utils # 基础工具目录 + ├── run_mul_v1.sh # 多卡训练启动脚本_v1 + ├── run_mul_v1_checkpoint.sh # 加载checkpoint多卡训练启动脚本_v1 + ├── run_mul_v2.sh # 多卡训练启动脚本_v2 + ├── run_mul_v2_checkpoint.sh # 加载checkpoint多卡训练启动脚本_v2 + ├── run_sing_v1.sh # 单卡训练启动脚本_v1 + ├── run_sing_v1_checkpoint.sh # 加载checkpoint单卡训练启动脚本_v1 + ├── run_sing_v2.sh # 单卡训练启动脚本_v2 + ├── run_sing_v2_checkpoint.sh # 加载checkpoint单卡训练启动脚本_v2 + ├── run_cpu_v1.sh # CPU训练启动脚本_v1 + ├── run_cpu_v2.sh # CPU训练启动脚本_v2 + ├── train.py # 训练入口 + ├── mobilenet_v1.py # 网络结构v1 + ├── mobilenet_v2.py # 网络结构v2 + ├── reader.py # 数据reader + ├── utils # 基础工具目录 **数据准备** @@ -26,6 +30,7 @@ bash run_mul_v1.sh bash run_mul_v2.sh + 若使用单卡训练,启动方式如下: bash run_sing_v1.sh @@ -36,6 +41,16 @@ bash run_cpu_v1.sh bash run_cpu_v2.sh +训练过程中,checkpoint会保存在参数model_save_dir指定的文件夹中,我们支持加载checkpoint继续训练. +加载checkpoint使用4卡训练,启动方式如下: + + bash run_mul_v1_checkpoint.sh + bash run_mul_v2_checkpoint.sh + +加载checkpoint使用单卡训练,启动方式如下: + + bash run_sing_v1_checkpoint.sh + bash run_sing_v2_checkpoint.sh **模型性能** diff --git a/dygraph/mobilenet/mobilenet_v1.py b/dygraph/mobilenet/mobilenet_v1.py index 56c12b9a4d96d292bce2a68633ceff6f40e732cc..e3a5a94eab46477a8fb9676f5a5bf67000783018 100644 --- a/dygraph/mobilenet/mobilenet_v1.py +++ b/dygraph/mobilenet/mobilenet_v1.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +#order: standard library, third party, local library import os import time import sys +import math import numpy as np import argparse -import ast import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA @@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable from paddle.fluid import framework -import math -import sys class ConvBNLayer(fluid.dygraph.Layer): diff --git a/dygraph/mobilenet/mobilenet_v2.py b/dygraph/mobilenet/mobilenet_v2.py index 2466d4307dd8482f8e3c5070f9fdf6a1053e085f..6da031f298c1e76c21d6415da4b4fe0dd9715731 100644 --- a/dygraph/mobilenet/mobilenet_v2.py +++ b/dygraph/mobilenet/mobilenet_v2.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +#order: standard library, third party, local library import os -import numpy as np import time -import sys +import math import sys import numpy as np import argparse -import ast import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA @@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable - from paddle.fluid import framework -import math -import sys class ConvBNLayer(fluid.dygraph.Layer): diff --git a/dygraph/mobilenet/run_mul_v1.sh b/dygraph/mobilenet/run_mul_v1.sh index d84e4d1d7fff1a02045383667c4caa6dc5a0d548..fa48ef5fe46ebfcf86c84a21bc1ecb7ad8a492df 100644 --- a/dygraph/mobilenet/run_mul_v1.sh +++ b/dygraph/mobilenet/run_mul_v1.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3 -python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 +python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 --model_save_dir=output.v1.mul/ --num_epochs=120 diff --git a/dygraph/mobilenet/run_mul_v2.sh b/dygraph/mobilenet/run_mul_v2.sh index a3f9991e330e25edd7440cf681397d0fd4f78d77..485cad365c3727710678f7426e3238b94c20f6e9 100644 --- a/dygraph/mobilenet/run_mul_v2.sh +++ b/dygraph/mobilenet/run_mul_v2.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3 -python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV2 +python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 diff --git a/dygraph/mobilenet/run_sing_v1.sh b/dygraph/mobilenet/run_sing_v1.sh index 3e480faadfba596a139b7709d81b9351ff97a85a..c4fef2984b06aa98b04e9ab0a481530ec3c22034 100644 --- a/dygraph/mobilenet/run_sing_v1.sh +++ b/dygraph/mobilenet/run_sing_v1.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0 -python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 +python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 diff --git a/dygraph/mobilenet/run_sing_v2.sh b/dygraph/mobilenet/run_sing_v2.sh index 9db7a20f169d8019d3b0c8c90b9104321fd1263b..f747ee5e01ba7d8d5c5eb35fb6e732a381a305b9 100644 --- a/dygraph/mobilenet/run_sing_v2.sh +++ b/dygraph/mobilenet/run_sing_v2.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0 -python3 train.py --batch_size=128 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --model=MobileNetV2 +python3 train.py --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py index 42648b3e73305828fde2f0e4223fb3f27c29cf75..254279baedf3879ada6bc5c92ab3f733e5f3d524 100644 --- a/dygraph/mobilenet/train.py +++ b/dygraph/mobilenet/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,35 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. -from mobilenet_v1 import * -from mobilenet_v2 import * +#order: standard library, third party, local library import os -import numpy as np import time import sys -import sys -import numpy as np +import math import argparse -import ast +import numpy as np import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.fluid.layer_helper import LayerHelper -#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.dygraph.base import to_variable - from paddle.fluid import framework - -import math -import sys import reader from utils import * - -IMAGENET1000 = 1281167 -base_lr = 0.1 -momentum_rate = 0.9 -l2_decay = 1e-4 +from mobilenet_v1 import * +from mobilenet_v2 import * args = parse_args() if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0: @@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop): for img, label in test_data_loader(): t1 = time.time() label = to_variable(label.numpy().astype('int64').reshape( - int(args.batch_size / paddle.fluid.core.get_cuda_device_count()), + int(args.batch_size // paddle.fluid.core.get_cuda_device_count()), 1)) out = net(img) softmax_out = fluid.layers.softmax(out, use_cudnn=False) @@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop): def train_mobilenet(): - epoch = args.num_epochs - place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ - if args.use_data_parallel else fluid.CUDAPlace(0) + if not args.use_gpu: + place = fluid.CPUPlace() + elif not args.use_data_parallel: + place = fluid.CUDAPlace(0) + else: + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) with fluid.dygraph.guard(place): + # 1. init net and optimizer if args.ce: print("ce mode") seed = 33 @@ -93,13 +86,12 @@ def train_mobilenet(): if args.use_data_parallel: strategy = fluid.dygraph.parallel.prepare_context() - net = None if args.model == "MobileNetV1": - net = MobileNetV1(class_dim=args.class_dim) - para_name = 'mobilenet_v1_params' + net = MobileNetV1(class_dim=args.class_dim, scale=1.0) + model_path_pre = 'mobilenet_v1' elif args.model == "MobileNetV2": net = MobileNetV2(class_dim=args.class_dim, scale=1.0) - para_name = 'mobilenet_v2_params' + model_path_pre = 'mobilenet_v2' else: print( "wrong model name, please try model = MobileNetV1 or MobileNetV2" @@ -109,6 +101,18 @@ def train_mobilenet(): optimizer = create_optimizer(args=args, parameter_list=net.parameters()) if args.use_data_parallel: net = fluid.dygraph.parallel.DataParallel(net, strategy) + + # 2. load checkpoint + if args.checkpoint: + assert os.path.exists(args.checkpoint + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(args.checkpoint) + assert os.path.exists(args.checkpoint + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(args.checkpoint) + para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint) + net.set_dict(para_dict) + optimizer.set_dict(opti_dict) + + # 3. reader train_data_loader, train_data = utility.create_data_loader( is_train=True, args=args) test_data_loader, test_data = utility.create_data_loader( @@ -119,7 +123,9 @@ def train_mobilenet(): test_reader = imagenet_reader.val(settings=args) train_data_loader.set_sample_list_generator(train_reader, place) test_data_loader.set_sample_list_generator(test_reader, place) - for eop in range(epoch): + + # 4. train loop + for eop in range(args.num_epochs): if num_trainers > 1: imagenet_reader.set_shuffle_seed(eop + ( args.random_seed if args.random_seed else 0)) @@ -130,13 +136,17 @@ def train_mobilenet(): total_sample = 0 batch_id = 0 t_last = 0 + # 4.1 for each batch, call net() , backward(), and minimize() for img, label in train_data_loader(): t1 = time.time() label = to_variable(label.numpy().astype('int64').reshape( - int(args.batch_size / + int(args.batch_size // paddle.fluid.core.get_cuda_device_count()), 1)) t_start = time.time() + + # 4.1.1 call net() out = net(img) + t_end = time.time() softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy( @@ -145,14 +155,20 @@ def train_mobilenet(): acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) t_start_back = time.time() + + # 4.1.2 call backward() if args.use_data_parallel: avg_loss = net.scale_loss(avg_loss) avg_loss.backward() net.apply_collective_grads() else: avg_loss.backward() + t_end_back = time.time() + + # 4.1.3 call minimize() optimizer.minimize(avg_loss) + net.clear_gradients() t2 = time.time() train_batch_elapse = t2 - t1 @@ -174,13 +190,31 @@ def train_mobilenet(): print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \ (eop, batch_id, total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse)) - net.eval() - eval(net, test_data_loader, eop) + + # 4.2 save checkpoint save_parameters = (not args.use_data_parallel) or ( args.use_data_parallel and fluid.dygraph.parallel.Env().local_rank == 0) if save_parameters: - fluid.save_dygraph(net.state_dict(), para_name) + if not os.path.isdir(args.model_save_dir): + os.makedirs(args.model_save_dir) + model_path = os.path.join( + args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop)) + fluid.dygraph.save_dygraph(net.state_dict(), model_path) + fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path) + + # 4.3 validation + net.eval() + eval(net, test_data_loader, eop) + + # 5. save final results + save_parameters = (not args.use_data_parallel) or ( + args.use_data_parallel and + fluid.dygraph.parallel.Env().local_rank == 0) + if save_parameters: + model_path = os.path.join( + args.model_save_dir, "_" + model_path_pre + "_final") + fluid.dygraph.save_dygraph(net.state_dict(), model_path) if __name__ == '__main__':