Commit 3bb750d1, authored by root

add features: train with cpu, save and load checkpoint

Parent 79eac5d4
**Model Introduction**
Image classification is an important area of computer vision; its goal is to assign an image to one of a set of predefined labels. CNN models have achieved breakthrough results in image classification, while model complexity keeps growing. MobileNet is a compact and efficient CNN model. This document describes how to use the PaddlePaddle dygraph implementation of MobileNet for image classification.
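MobileNet's efficiency comes from replacing standard convolutions with depthwise separable convolutions: a per-channel 3x3 depthwise convolution followed by a 1x1 pointwise convolution, each with batch norm and ReLU. The sketch below is only an illustration of that idea in the fluid dygraph API used by this repo; it is not the repo's actual `ConvBNLayer` (shown in the diffs further down), and the class/argument names are assumptions.

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable
from paddle.fluid.dygraph.nn import Conv2D, BatchNorm


class DepthwiseSeparable(fluid.dygraph.Layer):
    """3x3 depthwise conv + 1x1 pointwise conv, each followed by BN and ReLU."""

    def __init__(self, in_channels, out_channels, stride=1):
        super(DepthwiseSeparable, self).__init__()
        # Depthwise: groups == num_channels, so each channel gets its own 3x3 filter.
        # Depthwise convolutions are typically run without cuDNN.
        self.depthwise = Conv2D(
            num_channels=in_channels, num_filters=in_channels, filter_size=3,
            stride=stride, padding=1, groups=in_channels, use_cudnn=False,
            bias_attr=False)
        self.bn1 = BatchNorm(in_channels, act='relu')
        # Pointwise: a 1x1 convolution mixes information across channels.
        self.pointwise = Conv2D(
            num_channels=in_channels, num_filters=out_channels, filter_size=1,
            bias_attr=False)
        self.bn2 = BatchNorm(out_channels, act='relu')

    def forward(self, inputs):
        y = self.bn1(self.depthwise(inputs))
        return self.bn2(self.pointwise(y))


if __name__ == '__main__':
    with fluid.dygraph.guard(fluid.CPUPlace()):
        block = DepthwiseSeparable(in_channels=32, out_channels=64, stride=1)
        x = to_variable(np.random.rand(1, 32, 56, 56).astype('float32'))
        print(block(x).shape)  # [1, 64, 56, 56]
```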
**Code Structure**
├── run_mul_v1.sh      # multi-GPU training launch script (MobileNetV1)
├── run_mul_v2.sh      # multi-GPU training launch script (MobileNetV2)
├── run_sing_v1.sh     # single-GPU training launch script (MobileNetV1)
├── run_sing_v2.sh     # single-GPU training launch script (MobileNetV2)
├── train.py           # training entry point
├── mobilenet_v1.py    # MobileNetV1 network definition
├── mobilenet_v2.py    # MobileNetV2 network definition
├── reader.py          # data reader
├── utils              # utility directory
**Data Preparation**
Please refer to: https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification
**Model Training**
To train with 4 GPUs, launch as follows:
bash run_mul_v1.sh
bash run_mul_v2.sh
To train with a single GPU, launch as follows:
bash run_sing_v1.sh
bash run_sing_v2.sh
**Model Accuracy**
Accuracy on the ImageNet-2012 validation set:

| Model       | Top-1 | Top-5 |
|-------------|-------|-------|
| MobileNetV1 | 0.707 | 0.895 |
| MobileNetV2 | 0.626 | 0.845 |
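Top-1 means the highest-scoring class equals the label; Top-5 means the label is among the five highest-scoring classes. train.py computes both with `fluid.layers.accuracy`; the snippet below is a tiny self-contained illustration of that API on random data (it does not reproduce the numbers above).

```python
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.dygraph.base import to_variable

# Same calls as in train.py's eval loop, run on random logits/labels.
with fluid.dygraph.guard(fluid.CPUPlace()):
    logits = to_variable(np.random.rand(8, 1000).astype('float32'))
    labels = to_variable(np.random.randint(0, 1000, size=(8, 1)).astype('int64'))
    top1 = fluid.layers.accuracy(input=logits, label=labels, k=1)
    top5 = fluid.layers.accuracy(input=logits, label=labels, k=5)
    print(float(top1.numpy()), float(top5.numpy()))
```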
**References**
MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam
MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen
@@ -4,17 +4,21 @@
 **Code Structure**
 ├── run_mul_v1.sh              # multi-GPU training launch script (MobileNetV1)
+├── run_mul_v1_checkpoint.sh   # multi-GPU training launch script, resuming from a checkpoint (MobileNetV1)
 ├── run_mul_v2.sh              # multi-GPU training launch script (MobileNetV2)
+├── run_mul_v2_checkpoint.sh   # multi-GPU training launch script, resuming from a checkpoint (MobileNetV2)
 ├── run_sing_v1.sh             # single-GPU training launch script (MobileNetV1)
+├── run_sing_v1_checkpoint.sh  # single-GPU training launch script, resuming from a checkpoint (MobileNetV1)
 ├── run_sing_v2.sh             # single-GPU training launch script (MobileNetV2)
+├── run_sing_v2_checkpoint.sh  # single-GPU training launch script, resuming from a checkpoint (MobileNetV2)
 ├── run_cpu_v1.sh              # CPU training launch script (MobileNetV1)
 ├── run_cpu_v2.sh              # CPU training launch script (MobileNetV2)
 ├── train.py                   # training entry point
 ├── mobilenet_v1.py            # MobileNetV1 network definition
 ├── mobilenet_v2.py            # MobileNetV2 network definition
 ├── reader.py                  # data reader
 ├── utils                      # utility directory
 **Data Preparation**
@@ -26,6 +30,7 @@
 bash run_mul_v1.sh
 bash run_mul_v2.sh
 To train with a single GPU, launch as follows:
 bash run_sing_v1.sh
@@ -36,6 +41,16 @@
 bash run_cpu_v1.sh
 bash run_cpu_v2.sh
+During training, checkpoints are saved under the directory given by the model_save_dir argument, and training can be resumed from a saved checkpoint.
+To resume from a checkpoint on 4 GPUs, launch as follows:
+bash run_mul_v1_checkpoint.sh
+bash run_mul_v2_checkpoint.sh
+To resume from a checkpoint on a single GPU, launch as follows:
+bash run_sing_v1_checkpoint.sh
+bash run_sing_v2_checkpoint.sh
 **Model Performance**
......
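The `*_checkpoint.sh` scripts resume training by pointing train.py at a saved prefix (the `args.checkpoint` value in the train.py diff further down); the mechanics are the dygraph save/load API that this commit adds. Below is a minimal standalone sketch of the same idea, assuming a prefix such as `output/_mobilenet_v1_epoch10`; the helper names are illustrative and not part of the repo.

```python
import os
import paddle.fluid as fluid


def save_checkpoint(net, optimizer, model_save_dir, prefix):
    """Write <prefix>.pdparams and <prefix>.pdopt under model_save_dir."""
    if not os.path.isdir(model_save_dir):
        os.makedirs(model_save_dir)
    path = os.path.join(model_save_dir, prefix)
    fluid.dygraph.save_dygraph(net.state_dict(), path)        # parameters
    fluid.dygraph.save_dygraph(optimizer.state_dict(), path)  # optimizer state
    return path


def load_checkpoint(net, optimizer, path):
    """Restore parameters and optimizer state from a checkpoint prefix."""
    para_dict, opti_dict = fluid.dygraph.load_dygraph(path)
    net.set_dict(para_dict)
    optimizer.set_dict(opti_dict)
```

Both calls have to run inside `fluid.dygraph.guard(place)`, which is why train.py loads the checkpoint right after the network and optimizer are created.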
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#order: standard library, third party, local library
 import os
 import time
 import sys
+import math
 import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid import framework
-import math
-import sys

 class ConvBNLayer(fluid.dygraph.Layer):
......
@@ -12,14 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+#order: standard library, third party, local library
 import os
-import numpy as np
 import time
-import sys
+import math
 import sys
+import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
@@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
 from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid import framework
-import math
-import sys

 class ConvBNLayer(fluid.dygraph.Layer):
......
 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1
+python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 --model_save_dir=output.v1.mul/ --num_epochs=120

 export CUDA_VISIBLE_DEVICES=0,1,2,3
-python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV2
+python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2

 export CUDA_VISIBLE_DEVICES=0
-python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1
+python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1

 export CUDA_VISIBLE_DEVICES=0
-python3 train.py --batch_size=128 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --model=MobileNetV2
+python3 train.py --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2
-# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,35 +12,24 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from mobilenet_v1 import *
-from mobilenet_v2 import *
+#order: standard library, third party, local library
 import os
-import numpy as np
 import time
 import sys
-import sys
+import math
+import numpy as np
 import argparse
-import ast
 import paddle
 import paddle.fluid as fluid
 from paddle.fluid.initializer import MSRA
 from paddle.fluid.param_attr import ParamAttr
 from paddle.fluid.layer_helper import LayerHelper
-#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC
 from paddle.fluid.dygraph.base import to_variable
 from paddle.fluid import framework
-import math
-import sys
 import reader
 from utils import *
+from mobilenet_v1 import *
+from mobilenet_v2 import *

-IMAGENET1000 = 1281167
-base_lr = 0.1
-momentum_rate = 0.9
-l2_decay = 1e-4
 args = parse_args()
 if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0:
@@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop):
     for img, label in test_data_loader():
         t1 = time.time()
         label = to_variable(label.numpy().astype('int64').reshape(
-            int(args.batch_size / paddle.fluid.core.get_cuda_device_count()),
+            int(args.batch_size // paddle.fluid.core.get_cuda_device_count()),
             1))
         out = net(img)
         softmax_out = fluid.layers.softmax(out, use_cudnn=False)
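The `/` to `//` change above computes the per-device batch size with integer floor division instead of float division. The result is the same when the global batch divides evenly across devices, but `//` avoids the float intermediate and makes the reshape dimension explicitly an int. A quick illustration, assuming 4 visible GPUs:

```python
batch_size = 256
num_devices = 4  # stand-in for paddle.fluid.core.get_cuda_device_count()

print(batch_size / num_devices)   # 64.0 -> float under Python 3
print(batch_size // num_devices)  # 64   -> int, suitable as a reshape dimension
print(int(250 / 4), 250 // 4)     # 62 62 -> equal here, but // skips the float step
```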
@@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop):

 def train_mobilenet():
-    epoch = args.num_epochs
-    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \
-        if args.use_data_parallel else fluid.CUDAPlace(0)
+    if not args.use_gpu:
+        place = fluid.CPUPlace()
+    elif not args.use_data_parallel:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
     with fluid.dygraph.guard(place):
+        # 1. init net and optimizer
         if args.ce:
             print("ce mode")
             seed = 33
@@ -93,13 +86,12 @@ def train_mobilenet():
         if args.use_data_parallel:
             strategy = fluid.dygraph.parallel.prepare_context()

-        net = None
         if args.model == "MobileNetV1":
-            net = MobileNetV1(class_dim=args.class_dim)
-            para_name = 'mobilenet_v1_params'
+            net = MobileNetV1(class_dim=args.class_dim, scale=1.0)
+            model_path_pre = 'mobilenet_v1'
         elif args.model == "MobileNetV2":
             net = MobileNetV2(class_dim=args.class_dim, scale=1.0)
-            para_name = 'mobilenet_v2_params'
+            model_path_pre = 'mobilenet_v2'
         else:
             print(
                 "wrong model name, please try model = MobileNetV1 or MobileNetV2"
@@ -109,6 +101,18 @@ def train_mobilenet():
         optimizer = create_optimizer(args=args, parameter_list=net.parameters())
         if args.use_data_parallel:
             net = fluid.dygraph.parallel.DataParallel(net, strategy)

+        # 2. load checkpoint
+        if args.checkpoint:
+            assert os.path.exists(args.checkpoint + ".pdparams"), \
+                "Given dir {}.pdparams not exist.".format(args.checkpoint)
+            assert os.path.exists(args.checkpoint + ".pdopt"), \
+                "Given dir {}.pdopt not exist.".format(args.checkpoint)
+            para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
+            net.set_dict(para_dict)
+            optimizer.set_dict(opti_dict)
+
+        # 3. reader
         train_data_loader, train_data = utility.create_data_loader(
             is_train=True, args=args)
         test_data_loader, test_data = utility.create_data_loader(
@@ -119,7 +123,9 @@ def train_mobilenet():
         test_reader = imagenet_reader.val(settings=args)
         train_data_loader.set_sample_list_generator(train_reader, place)
         test_data_loader.set_sample_list_generator(test_reader, place)
-        for eop in range(epoch):
+
+        # 4. train loop
+        for eop in range(args.num_epochs):
             if num_trainers > 1:
                 imagenet_reader.set_shuffle_seed(eop + (
                     args.random_seed if args.random_seed else 0))
@@ -130,13 +136,17 @@ def train_mobilenet():
             total_sample = 0
             batch_id = 0
             t_last = 0
+            # 4.1 for each batch, call net(), backward(), and minimize()
             for img, label in train_data_loader():
                 t1 = time.time()
                 label = to_variable(label.numpy().astype('int64').reshape(
-                    int(args.batch_size /
+                    int(args.batch_size //
                         paddle.fluid.core.get_cuda_device_count()), 1))
                 t_start = time.time()
+
+                # 4.1.1 call net()
                 out = net(img)
                 t_end = time.time()
                 softmax_out = fluid.layers.softmax(out, use_cudnn=False)
                 loss = fluid.layers.cross_entropy(
@@ -145,14 +155,20 @@ def train_mobilenet():
                 acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
                 acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
                 t_start_back = time.time()
+
+                # 4.1.2 call backward()
                 if args.use_data_parallel:
                     avg_loss = net.scale_loss(avg_loss)
                     avg_loss.backward()
                     net.apply_collective_grads()
                 else:
                     avg_loss.backward()
                 t_end_back = time.time()
+
+                # 4.1.3 call minimize()
                 optimizer.minimize(avg_loss)
                 net.clear_gradients()

                 t2 = time.time()
                 train_batch_elapse = t2 - t1
@@ -174,13 +190,31 @@ def train_mobilenet():
             print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \
                   (eop, batch_id, total_loss / total_sample, \
                    total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse))
-            net.eval()
-            eval(net, test_data_loader, eop)
+
+            # 4.2 save checkpoint
             save_parameters = (not args.use_data_parallel) or (
                 args.use_data_parallel and
                 fluid.dygraph.parallel.Env().local_rank == 0)
             if save_parameters:
-                fluid.save_dygraph(net.state_dict(), para_name)
+                if not os.path.isdir(args.model_save_dir):
+                    os.makedirs(args.model_save_dir)
+                model_path = os.path.join(
+                    args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop))
+                fluid.dygraph.save_dygraph(net.state_dict(), model_path)
+                fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
+
+            # 4.3 validation
+            net.eval()
+            eval(net, test_data_loader, eop)
+
+        # 5. save final results
+        save_parameters = (not args.use_data_parallel) or (
+            args.use_data_parallel and
+            fluid.dygraph.parallel.Env().local_rank == 0)
+        if save_parameters:
+            model_path = os.path.join(
+                args.model_save_dir, "_" + model_path_pre + "_final")
+            fluid.dygraph.save_dygraph(net.state_dict(), model_path)

 if __name__ == '__main__':
......
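For completeness, the `_final` parameters saved above can be restored later for evaluation. A hedged sketch, assuming `model_save_dir=output/` and the MobileNetV1 branch, so the saved prefix would be `output/_mobilenet_v1_final` (the path is an example, not repo code):

```python
import paddle.fluid as fluid
from mobilenet_v1 import MobileNetV1

with fluid.dygraph.guard(fluid.CPUPlace()):
    net = MobileNetV1(class_dim=1000, scale=1.0)
    # load_dygraph returns (param_dict, optimizer_dict); inference needs only the former.
    para_dict, _ = fluid.dygraph.load_dygraph("output/_mobilenet_v1_final")
    net.set_dict(para_dict)
    net.eval()
```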