From 22b8805bb5a6a6ee28fb228fc4ce7fee4791bef1 Mon Sep 17 00:00:00 2001 From: chajchaj <57249073+chajchaj@users.noreply.github.com> Date: Wed, 12 Feb 2020 01:06:42 +0800 Subject: [PATCH] add features: train with cpu, save and load checkpoint (#4259) --- dygraph/mobilenet/RADEME.md | 44 ---------- dygraph/mobilenet/README.md | 67 +++++++++++++++ dygraph/mobilenet/mobilenet_v1.py | 5 +- dygraph/mobilenet/mobilenet_v2.py | 8 +- dygraph/mobilenet/run_cpu_v1.sh | 1 + dygraph/mobilenet/run_cpu_v2.sh | 1 + dygraph/mobilenet/run_mul_v1.sh | 2 +- dygraph/mobilenet/run_mul_v1_checkpoint.sh | 2 + dygraph/mobilenet/run_mul_v2.sh | 2 +- dygraph/mobilenet/run_mul_v2_checkpoint.sh | 2 + dygraph/mobilenet/run_sing_v1.sh | 2 +- dygraph/mobilenet/run_sing_v1_checkpoint.sh | 2 + dygraph/mobilenet/run_sing_v2.sh | 2 +- dygraph/mobilenet/run_sing_v2_checkpoint.sh | 2 + dygraph/mobilenet/train.py | 94 ++++++++++++++------- 15 files changed, 149 insertions(+), 87 deletions(-) delete mode 100644 dygraph/mobilenet/RADEME.md create mode 100644 dygraph/mobilenet/README.md create mode 100644 dygraph/mobilenet/run_cpu_v1.sh create mode 100644 dygraph/mobilenet/run_cpu_v2.sh create mode 100644 dygraph/mobilenet/run_mul_v1_checkpoint.sh create mode 100644 dygraph/mobilenet/run_mul_v2_checkpoint.sh create mode 100644 dygraph/mobilenet/run_sing_v1_checkpoint.sh create mode 100644 dygraph/mobilenet/run_sing_v2_checkpoint.sh diff --git a/dygraph/mobilenet/RADEME.md b/dygraph/mobilenet/RADEME.md deleted file mode 100644 index beee2f1b..00000000 --- a/dygraph/mobilenet/RADEME.md +++ /dev/null @@ -1,44 +0,0 @@ -**模型简介** - -图像分类是计算机视觉的重要领域,它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破的成果,同时模型复杂度也在不断增加。MobileNet是一种小巧而高效CNN模型,本文介绍如何使PaddlePaddle的动态图MobileNet进行图像分类。 - -**代码结构** - - ├── run_mul_v1.sh # 多卡训练启动脚本_v1 - ├── run_mul_v2.sh # 多卡训练启动脚本_v2 - ├── run_sing_v1.sh # 单卡训练启动脚本_v1 - ├── run_sing_v2.sh # 单卡训练启动脚本_v2 - ├── train.py # 训练入口 - ├── mobilenet_v1.py # 网络结构v1 - ├── mobilenet_v2.py # 网络结构v2 - ├── reader.py # 
数据reader - ├── utils # 基础工具目录 - -**数据准备** - -请参考:https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification - -**模型训练** - -若使用4卡训练,启动方式如下: - - bash run_mul_v1.sh - bash run_mul_v2.sh -若使用单卡训练,启动方式如下: - - bash run_sing_v1.sh - bash run_sing_v2.sh - -**模型精度** - - Model Top-1 Top-5 - - MobileNetV1 0.707 0.895 - - MobileNetV2 0.626 0.845 - -**参考论文** - -MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam - -MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen diff --git a/dygraph/mobilenet/README.md b/dygraph/mobilenet/README.md new file mode 100644 index 00000000..c2e0477e --- /dev/null +++ b/dygraph/mobilenet/README.md @@ -0,0 +1,67 @@ +**模型简介** + +图像分类是计算机视觉的重要领域,它的目标是将图像分类到预定义的标签。CNN模型在图像分类领域取得了突破性的成果,同时模型复杂度也在不断增加。MobileNet是一种小巧而高效的CNN模型,本文介绍如何使用PaddlePaddle的动态图MobileNet进行图像分类。 + +**代码结构** + + ├── run_mul_v1.sh # 多卡训练启动脚本_v1 + ├── run_mul_v1_checkpoint.sh # 加载checkpoint多卡训练启动脚本_v1 + ├── run_mul_v2.sh # 多卡训练启动脚本_v2 + ├── run_mul_v2_checkpoint.sh # 加载checkpoint多卡训练启动脚本_v2 + ├── run_sing_v1.sh # 单卡训练启动脚本_v1 + ├── run_sing_v1_checkpoint.sh # 加载checkpoint单卡训练启动脚本_v1 + ├── run_sing_v2.sh # 单卡训练启动脚本_v2 + ├── run_sing_v2_checkpoint.sh # 加载checkpoint单卡训练启动脚本_v2 + ├── run_cpu_v1.sh # CPU训练启动脚本_v1 + ├── run_cpu_v2.sh # CPU训练启动脚本_v2 + ├── train.py # 训练入口 + ├── mobilenet_v1.py # 网络结构v1 + ├── mobilenet_v2.py # 网络结构v2 + ├── reader.py # 数据reader + ├── utils # 基础工具目录 + +**数据准备** + +请参考:https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/image_classification + +**模型训练** + +若使用4卡训练,启动方式如下: + + bash run_mul_v1.sh + bash run_mul_v2.sh + +若使用单卡训练,启动方式如下: + + bash run_sing_v1.sh + bash run_sing_v2.sh + +若使用CPU训练,启动方式如下: + + bash run_cpu_v1.sh + bash run_cpu_v2.sh + +训练过程中,checkpoint会保存在参数model_save_dir指定的文件夹中,我们支持加载checkpoint继续训练。 
+加载checkpoint使用4卡训练,启动方式如下: + + bash run_mul_v1_checkpoint.sh + bash run_mul_v2_checkpoint.sh + +加载checkpoint使用单卡训练,启动方式如下: + + bash run_sing_v1_checkpoint.sh + bash run_sing_v2_checkpoint.sh + +**模型性能** + + Model Top-1(单卡/4卡) Top-5(单卡/4卡) 收敛时间(单卡/4卡) + + MobileNetV1 0.707/0.711 0.897/0.899 116小时/30.9小时 + + MobileNetV2 0.708/0.724 0.899/0.906 227.8小时/60.8小时 + +**参考论文** + +MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications, Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam + +MobileNetV2: Inverted Residuals and Linear Bottlenecks, Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen diff --git a/dygraph/mobilenet/mobilenet_v1.py b/dygraph/mobilenet/mobilenet_v1.py index 56c12b9a..e3a5a94e 100644 --- a/dygraph/mobilenet/mobilenet_v1.py +++ b/dygraph/mobilenet/mobilenet_v1.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +#order: standard library, third party, local library import os import time import sys +import math import numpy as np import argparse -import ast import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA @@ -26,8 +27,6 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable from paddle.fluid import framework -import math -import sys class ConvBNLayer(fluid.dygraph.Layer): diff --git a/dygraph/mobilenet/mobilenet_v2.py b/dygraph/mobilenet/mobilenet_v2.py index 2466d430..6da031f2 100644 --- a/dygraph/mobilenet/mobilenet_v2.py +++ b/dygraph/mobilenet/mobilenet_v2.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+#order: standard library, third party, local library import os -import numpy as np import time -import sys +import math import sys import numpy as np import argparse -import ast import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA @@ -27,11 +26,8 @@ from paddle.fluid.param_attr import ParamAttr from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear from paddle.fluid.dygraph.base import to_variable - from paddle.fluid import framework -import math -import sys class ConvBNLayer(fluid.dygraph.Layer): diff --git a/dygraph/mobilenet/run_cpu_v1.sh b/dygraph/mobilenet/run_cpu_v1.sh new file mode 100644 index 00000000..81de4df3 --- /dev/null +++ b/dygraph/mobilenet/run_cpu_v1.sh @@ -0,0 +1 @@ +python3 train.py --use_gpu=False --batch_size=64 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 diff --git a/dygraph/mobilenet/run_cpu_v2.sh b/dygraph/mobilenet/run_cpu_v2.sh new file mode 100644 index 00000000..4c18c006 --- /dev/null +++ b/dygraph/mobilenet/run_cpu_v2.sh @@ -0,0 +1 @@ +python3 train.py --use_gpu=False --batch_size=64 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=/ssd9/chaj//data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 diff --git a/dygraph/mobilenet/run_mul_v1.sh b/dygraph/mobilenet/run_mul_v1.sh index d84e4d1d..fa48ef5f 100644 --- a/dygraph/mobilenet/run_mul_v1.sh +++ b/dygraph/mobilenet/run_mul_v1.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3 -python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 
--data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 +python3 -m paddle.distributed.launch --log_dir ./mylog.v1 train.py --use_data_parallel 1 --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 --model_save_dir=output.v1.mul/ --num_epochs=120 diff --git a/dygraph/mobilenet/run_mul_v1_checkpoint.sh b/dygraph/mobilenet/run_mul_v1_checkpoint.sh new file mode 100644 index 00000000..6b511f19 --- /dev/null +++ b/dygraph/mobilenet/run_mul_v1_checkpoint.sh @@ -0,0 +1,2 @@ +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --log_dir ./mylog.v1.checkpoint train.py --use_data_parallel 1 --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 --model_save_dir=output.v1.mul.checkpoint/ --num_epochs=120 --checkpoint=./output.v1.mul/_mobilenet_v1_epoch50 diff --git a/dygraph/mobilenet/run_mul_v2.sh b/dygraph/mobilenet/run_mul_v2.sh index a3f9991e..485cad36 100644 --- a/dygraph/mobilenet/run_mul_v2.sh +++ b/dygraph/mobilenet/run_mul_v2.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3 -python3 -m paddle.distributed.launch --log_dir ./mylog.time train.py --use_data_parallel 1 --batch_size=256 --reader_thread=8 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV2 +python3 -m paddle.distributed.launch --log_dir ./mylog.v2 train.py --use_data_parallel 1 --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 
diff --git a/dygraph/mobilenet/run_mul_v2_checkpoint.sh b/dygraph/mobilenet/run_mul_v2_checkpoint.sh new file mode 100644 index 00000000..2b1b5587 --- /dev/null +++ b/dygraph/mobilenet/run_mul_v2_checkpoint.sh @@ -0,0 +1,2 @@ +export CUDA_VISIBLE_DEVICES=0,1,2,3 +python3 -m paddle.distributed.launch --log_dir ./mylog.v2.checkpoint train.py --use_data_parallel 1 --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.mul.checkpoint/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 --checkpoint=./output.v2.mul/_mobilenet_v2_epoch50 diff --git a/dygraph/mobilenet/run_sing_v1.sh b/dygraph/mobilenet/run_sing_v1.sh index 3e480faa..c4fef298 100644 --- a/dygraph/mobilenet/run_sing_v1.sh +++ b/dygraph/mobilenet/run_sing_v1.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0 -python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 +python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 diff --git a/dygraph/mobilenet/run_sing_v1_checkpoint.sh b/dygraph/mobilenet/run_sing_v1_checkpoint.sh new file mode 100644 index 00000000..47d68d96 --- /dev/null +++ b/dygraph/mobilenet/run_sing_v1_checkpoint.sh @@ -0,0 +1,2 @@ +export CUDA_VISIBLE_DEVICES=0 +python3 train.py --batch_size=256 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v1.sing/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=./data/ILSVRC2012 --l2_decay=3e-5 --model=MobileNetV1 --checkpoint=./output.v1.sing/_mobilenet_v1_epoch50 diff --git a/dygraph/mobilenet/run_sing_v2.sh 
b/dygraph/mobilenet/run_sing_v2.sh index 9db7a20f..f747ee5e 100644 --- a/dygraph/mobilenet/run_sing_v2.sh +++ b/dygraph/mobilenet/run_sing_v2.sh @@ -1,2 +1,2 @@ export CUDA_VISIBLE_DEVICES=0 -python3 train.py --batch_size=128 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output/ --lr_strategy=piecewise_decay --lr=0.1 --data_dir=../../PaddleCV/image_classification/data/ILSVRC2012 --model=MobileNetV2 +python3 train.py --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 diff --git a/dygraph/mobilenet/run_sing_v2_checkpoint.sh b/dygraph/mobilenet/run_sing_v2_checkpoint.sh new file mode 100644 index 00000000..ed77b221 --- /dev/null +++ b/dygraph/mobilenet/run_sing_v2_checkpoint.sh @@ -0,0 +1,2 @@ +export CUDA_VISIBLE_DEVICES=0 +python3 train.py --batch_size=500 --total_images=1281167 --class_dim=1000 --image_shape=3,224,224 --model_save_dir=output.v2.sing/ --lr_strategy=cosine_decay --lr=0.1 --num_epochs=240 --data_dir=./data/ILSVRC2012 --l2_decay=4e-5 --model=MobileNetV2 --checkpoint=./output.v2.sing/_mobilenet_v2_epoch50 diff --git a/dygraph/mobilenet/train.py b/dygraph/mobilenet/train.py index 42648b3e..254279ba 100644 --- a/dygraph/mobilenet/train.py +++ b/dygraph/mobilenet/train.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,35 +12,24 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from mobilenet_v1 import * -from mobilenet_v2 import * +#order: standard library, third party, local library import os -import numpy as np import time import sys -import sys -import numpy as np +import math import argparse -import ast +import numpy as np import paddle import paddle.fluid as fluid from paddle.fluid.initializer import MSRA from paddle.fluid.param_attr import ParamAttr from paddle.fluid.layer_helper import LayerHelper -#from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, FC from paddle.fluid.dygraph.base import to_variable - from paddle.fluid import framework - -import math -import sys import reader from utils import * - -IMAGENET1000 = 1281167 -base_lr = 0.1 -momentum_rate = 0.9 -l2_decay = 1e-4 +from mobilenet_v1 import * +from mobilenet_v2 import * args = parse_args() if int(os.getenv("PADDLE_TRAINER_ID", 0)) == 0: @@ -56,7 +45,7 @@ def eval(net, test_data_loader, eop): for img, label in test_data_loader(): t1 = time.time() label = to_variable(label.numpy().astype('int64').reshape( - int(args.batch_size / paddle.fluid.core.get_cuda_device_count()), + int(args.batch_size // paddle.fluid.core.get_cuda_device_count()), 1)) out = net(img) softmax_out = fluid.layers.softmax(out, use_cudnn=False) @@ -80,10 +69,14 @@ def eval(net, test_data_loader, eop): def train_mobilenet(): - epoch = args.num_epochs - place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) \ - if args.use_data_parallel else fluid.CUDAPlace(0) + if not args.use_gpu: + place = fluid.CPUPlace() + elif not args.use_data_parallel: + place = fluid.CUDAPlace(0) + else: + place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id) with fluid.dygraph.guard(place): + # 1. 
init net and optimizer if args.ce: print("ce mode") seed = 33 @@ -93,13 +86,12 @@ def train_mobilenet(): if args.use_data_parallel: strategy = fluid.dygraph.parallel.prepare_context() - net = None if args.model == "MobileNetV1": - net = MobileNetV1(class_dim=args.class_dim) - para_name = 'mobilenet_v1_params' + net = MobileNetV1(class_dim=args.class_dim, scale=1.0) + model_path_pre = 'mobilenet_v1' elif args.model == "MobileNetV2": net = MobileNetV2(class_dim=args.class_dim, scale=1.0) - para_name = 'mobilenet_v2_params' + model_path_pre = 'mobilenet_v2' else: print( "wrong model name, please try model = MobileNetV1 or MobileNetV2" @@ -109,6 +101,18 @@ def train_mobilenet(): optimizer = create_optimizer(args=args, parameter_list=net.parameters()) if args.use_data_parallel: net = fluid.dygraph.parallel.DataParallel(net, strategy) + + # 2. load checkpoint + if args.checkpoint: + assert os.path.exists(args.checkpoint + ".pdparams"), \ + "Given dir {}.pdparams not exist.".format(args.checkpoint) + assert os.path.exists(args.checkpoint + ".pdopt"), \ + "Given dir {}.pdopt not exist.".format(args.checkpoint) + para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint) + net.set_dict(para_dict) + optimizer.set_dict(opti_dict) + + # 3. reader train_data_loader, train_data = utility.create_data_loader( is_train=True, args=args) test_data_loader, test_data = utility.create_data_loader( @@ -119,7 +123,9 @@ def train_mobilenet(): test_reader = imagenet_reader.val(settings=args) train_data_loader.set_sample_list_generator(train_reader, place) test_data_loader.set_sample_list_generator(test_reader, place) - for eop in range(epoch): + + # 4. 
train loop + for eop in range(args.num_epochs): if num_trainers > 1: imagenet_reader.set_shuffle_seed(eop + ( args.random_seed if args.random_seed else 0)) @@ -130,13 +136,17 @@ def train_mobilenet(): total_sample = 0 batch_id = 0 t_last = 0 + # 4.1 for each batch, call net() , backward(), and minimize() for img, label in train_data_loader(): t1 = time.time() label = to_variable(label.numpy().astype('int64').reshape( - int(args.batch_size / + int(args.batch_size // paddle.fluid.core.get_cuda_device_count()), 1)) t_start = time.time() + + # 4.1.1 call net() out = net(img) + t_end = time.time() softmax_out = fluid.layers.softmax(out, use_cudnn=False) loss = fluid.layers.cross_entropy( @@ -145,14 +155,20 @@ def train_mobilenet(): acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1) acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5) t_start_back = time.time() + + # 4.1.2 call backward() if args.use_data_parallel: avg_loss = net.scale_loss(avg_loss) avg_loss.backward() net.apply_collective_grads() else: avg_loss.backward() + t_end_back = time.time() + + # 4.1.3 call minimize() optimizer.minimize(avg_loss) + net.clear_gradients() t2 = time.time() train_batch_elapse = t2 - t1 @@ -174,13 +190,31 @@ def train_mobilenet(): print("epoch %d | batch step %d, loss %0.3f acc1 %0.3f acc5 %0.3f %2.4f sec" % \ (eop, batch_id, total_loss / total_sample, \ total_acc1 / total_sample, total_acc5 / total_sample, train_batch_elapse)) - net.eval() - eval(net, test_data_loader, eop) + + # 4.2 save checkpoint save_parameters = (not args.use_data_parallel) or ( args.use_data_parallel and fluid.dygraph.parallel.Env().local_rank == 0) if save_parameters: - fluid.save_dygraph(net.state_dict(), para_name) + if not os.path.isdir(args.model_save_dir): + os.makedirs(args.model_save_dir) + model_path = os.path.join( + args.model_save_dir, "_" + model_path_pre + "_epoch{}".format(eop)) + fluid.dygraph.save_dygraph(net.state_dict(), model_path) + 
fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path) + + # 4.3 validation + net.eval() + eval(net, test_data_loader, eop) + + # 5. save final results + save_parameters = (not args.use_data_parallel) or ( + args.use_data_parallel and + fluid.dygraph.parallel.Env().local_rank == 0) + if save_parameters: + model_path = os.path.join( + args.model_save_dir, "_" + model_path_pre + "_final") + fluid.dygraph.save_dygraph(net.state_dict(), model_path) if __name__ == '__main__': -- GitLab