提交 3bb750d1 编写于 作者: R root

add features: train with cpu, save and load checkpoint

上级 79eac5d4
**模型简介**
图像分类是计算机视觉的重要领域,它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破性的成果,同时模型复杂度也在不断增加。MobileNet是一种小巧而高效的CNN模型,本文介绍如何使用PaddlePaddle的动态图MobileNet进行图像分类。
**代码结构**
├── run_mul_v1.sh # 多卡训练启动脚本_v1
├── run_mul_v2.sh # 多卡训练启动脚本_v2
├── run_sing_v1.sh # 单卡训练启动脚本_v1
├── run_sing_v2.sh # 单卡训练启动脚本_v2
├── train.py # 训练入口
├── mobilenet_v1.py # 网络结构v1
├── mobilenet_v2.py # 网络结构v2
├── reader.py # 数据reader
├── utils # 基础工具目录
**数据准备**
请参考:https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification
**模型训练**
若使用4卡训练,启动方式如下:
bash run_mul_v1.sh
bash run_mul_v2.sh
若使用单卡训练,启动方式如下:
bash run_sing_v1.sh
bash run_sing_v2.sh
**模型精度**
Model Top-1 Top-5
MobileNetV1 0.707 0.895
MobileNetV2 0.626 0.845
**参考论文**
MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
......@@ -4,17 +4,21 @@
**代码结构**
├── run_mul_v1.sh # 多卡训练启动脚本_v1
├── run_mul_v2.sh # 多卡训练启动脚本_v2
├── run_sing_v1.sh # 单卡训练启动脚本_v1
├── run_sing_v2.sh # 单卡训练启动脚本_v2
├── run_cpu_v1.sh # CPU训练启动脚本_v1
├── run_cpu_v2.sh # CPU训练启动脚本_v2
├── train.py # 训练入口
├── mobilenet_v1.py # 网络结构v1
├── mobilenet_v2.py # 网络结构v2
├── reader.py # 数据reader
├── utils # 基础工具目录
├── run_mul_v1.sh # 多卡训练启动脚本_v1
├── run_mul_v1_checkpoint.sh # 加载checkpoint多卡训练启动脚本_v1
├── run_mul_v2.sh # 多卡训练启动脚本_v2
├── run_mul_v2_checkpoint.sh # 加载checkpoint多卡训练启动脚本_v2
├── run_sing_v1.sh # 单卡训练启动脚本_v1
├── run_sing_v1_checkpoint.sh # 加载checkpoint单卡训练启动脚本_v1
├── run_sing_v2.sh # 单卡训练启动脚本_v2
├── run_sing_v2_checkpoint.sh # 加载checkpoint单卡训练启动脚本_v2
├── run_cpu_v1.sh # CPU训练启动脚本_v1
├── run_cpu_v2.sh # CPU训练启动脚本_v2
├── train.py # 训练入口
├── mobilenet_v1.py # 网络结构v1
├── mobilenet_v2.py # 网络结构v2
├── reader.py # 数据reader
├── utils # 基础工具目录
**数据准备**
......@@ -26,6 +30,7 @@
bash run_mul_v1.sh
bash run_mul_v2.sh
若使用单卡训练,启动方式如下:
bash run_sing_v1.sh
......@@ -36,6 +41,16 @@
bash run_cpu_v1.sh
bash run_cpu_v2.sh
训练过程中,checkpoint会保存在参数model_save_dir指定的文件夹中,我们支持加载checkpoint继续训练。
加载checkpoint使用4卡训练,启动方式如下:
bash run_mul_v1_checkpoint.sh
bash run_mul_v2_checkpoint.sh
加载checkpoint使用单卡训练,启动方式如下:
bash run_sing_v1_checkpoint.sh
bash run_sing_v2_checkpoint.sh
**模型性能**
......
......@@ -12,12 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Import order: standard library, third-party, local library
import os
import time
import sys
import math
import numpy as np
import argparse
import ast
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
......@@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
import math
import sys
class ConvBNLayer(fluid.dygraph.Layer):
......
......@@ -12,14 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Import order: standard library, third-party, local library
import os
import numpy as np
import time
import sys
import math
import sys
import numpy as np
import argparse
import ast
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
......@@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
import math
import sys
class ConvBNLayer(fluid.dygraph.Layer):
......
export CUDA_VISIBLE_DEVICES=0,1,2,3
python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1
python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 --model_save_dir=output.v1.mul/ --num_epochs=120
export CUDA_VISIBLE_DEVICES=0,1,2,3
python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV2
python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2
export CUDA_VISIBLE_DEVICES=0
python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1
python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1
export CUDA_VISIBLE_DEVICES=0
python3 train.py --batch_size=128 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --model=MobileNetV2
python3 train.py --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,35 +12,24 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from mobilenet_v1 import *
from mobilenet_v2 import *
# Import order: standard library, third-party, local library
import os
import numpy as np
import time
import sys
import sys
import numpy as np
import math
import argparse
import ast
import numpy as np
import paddle
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid import framework
import math
import sys
import reader
from utils import *
IMAGENET1000 = 1281167
base_lr = 0.1
momentum_rate = 0.9
l2_decay = 1e-4
from mobilenet_v1 import *
from mobilenet_v2 import *
args = parse_args()
if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
......@@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop):
for img, label in test_data_loader():
t1 = time.time()
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size / paddle.fluid.core.get_cuda_device_count()),
int(args.batch_size // paddle.fluid.core.get_cuda_device_count()),
1))
out = net(img)
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
......@@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop):
def train_mobilenet():
epoch = args.num_epochs
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
if args.use_data_parallel else fluid.CUDAPlace(0)
if not args.use_gpu:
place = fluid.CPUPlace()
elif not args.use_data_parallel:
place = fluid.CUDAPlace(0)
else:
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
with fluid.dygraph.guard(place):
# 1. init net and optimizer
if args.ce:
print("ce mode")
seed = 33
......@@ -93,13 +86,12 @@ def train_mobilenet():
if args.use_data_parallel:
strategy = fluid.dygraph.parallel.prepare_context()
net = None
if args.model == "MobileNetV1":
net = MobileNetV1(class_dim=args.class_dim)
para_name = 'mobilenet_v1_params'
net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
model_path_pre = 'mobilenet_v1'
elif args.model == "MobileNetV2":
net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
para_name = 'mobilenet_v2_params'
model_path_pre = 'mobilenet_v2'
else:
print(
"wrong model name, please try model = MobileNetV1 or MobileNetV2"
......@@ -109,6 +101,18 @@ def train_mobilenet():
optimizer = create_optimizer(args=args, parameter_list=net.parameters())
if args.use_data_parallel:
net = fluid.dygraph.parallel.DataParallel(net, strategy)
# 2. load checkpoint
if args.checkpoint:
assert os.path.exists(args.checkpoint + ".pdparams"), \
"Given dir {}.pdparams not exist.".format(args.checkpoint)
assert os.path.exists(args.checkpoint + ".pdopt"), \
"Given dir {}.pdopt not exist.".format(args.checkpoint)
para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
net.set_dict(para_dict)
optimizer.set_dict(opti_dict)
# 3. reader
train_data_loader, train_data = utility.create_data_loader(
is_train=True, args=args)
test_data_loader, test_data = utility.create_data_loader(
......@@ -119,7 +123,9 @@ def train_mobilenet():
test_reader = imagenet_reader.val(settings=args)
train_data_loader.set_sample_list_generator(train_reader, place)
test_data_loader.set_sample_list_generator(test_reader, place)
for eop in range(epoch):
# 4. train loop
for eop in range(args.num_epochs):
if num_trainers > 1:
imagenet_reader.set_shuffle_seed(eop + (
args.random_seed if args.random_seed else 0))
......@@ -130,13 +136,17 @@ def train_mobilenet():
total_sample = 0
batch_id = 0
t_last = 0
# 4.1 for each batch, call net(), backward(), and minimize()
for img, label in train_data_loader():
t1 = time.time()
label = to_variable(label.numpy().astype('int64').reshape(
int(args.batch_size /
int(args.batch_size //
paddle.fluid.core.get_cuda_device_count()), 1))
t_start = time.time()
# 4.1.1 call net()
out = net(img)
t_end = time.time()
softmax_out = fluid.layers.softmax(out, use_cudnn=False)
loss = fluid.layers.cross_entropy(
......@@ -145,14 +155,20 @@ def train_mobilenet():
acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
t_start_back = time.time()
# 4.1.2 call backward()
if args.use_data_parallel:
avg_loss = net.scale_loss(avg_loss)
avg_loss.backward()
net.apply_collective_grads()
else:
avg_loss.backward()
t_end_back = time.time()
# 4.1.3 call minimize()
optimizer.minimize(avg_loss)
net.clear_gradients()
t2 = time.time()
train_batch_elapse = t2 - t1
......@@ -174,13 +190,31 @@ def train_mobilenet():
print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
(eop, batch_id, total_loss / total_sample, \
total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
net.eval()
eval(net, test_data_loader, eop)
# 4.2 save checkpoint
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
fluid.dygraph.parallel.Env().local_rank == 0)
if save_parameters:
fluid.save_dygraph(net.state_dict(), para_name)
if not os.path.isdir(args.model_save_dir):
os.makedirs(args.model_save_dir)
model_path = os.path.join(
args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop))
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
# 4.3 validation
net.eval()
eval(net, test_data_loader, eop)
# 5. save final results
save_parameters = (not args.use_data_parallel) or (
args.use_data_parallel and
fluid.dygraph.parallel.Env().local_rank == 0)
if save_parameters:
model_path = os.path.join(
args.model_save_dir, "_" + model_path_pre + "_final")
fluid.dygraph.save_dygraph(net.state_dict(), model_path)
if __name__ == '__main__':
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册