Unverified · Commit 4d7ec517 authored by SunGaofeng, committed by GitHub

add distributed training and gpu decoding and pre-processing of video (#4597)

* add distributed training and gpu decoding and pre-processing of video

* Add document for DALI installation
Parent 27ab65ef
......@@ -99,12 +99,19 @@ python train.py --model_name=STNET \
bash run.sh train STNET ./configs/stnet.yaml
```
Multi-GPU distributed training + GPU video decoding and pre-processing (TSN model only)
``` bash
bash run_dist.sh train TSN ./configs/tsn_dist_and_dali.yaml
```
- Adjust `num_gpus` and `batch_size` in the `config` file to match the number of cards specified by `CUDA_VISIBLE_DEVICES`; a sketch of the per-card batch computation follows this list.
- Set use\_gpu=False in run.sh for CPU training, or use\_gpu=True for GPU training.
- The launch script run.sh takes a task type, a model name, and a config file. The task types for training, evaluation, and prediction are train, eval, and predict respectively. The model name is one of [AttentionCluster, AttentionLSTM, NEXTVLAD, NONLOCAL, STNET, TSN, TSM, CTCN]. All config files live under PaddleVideo/configs; pick the one matching the model name. See each model's documentation for usage details.
- For the TSN model, GPU decoding and data pre-processing have been optimized and noticeably speed up training; see [TSN](./models/tsn/README.md) for details.
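The global batch size in the config is split evenly across the visible cards. A minimal sketch of that relationship (illustrative values, mirroring the batch-splitting done in the training script):
``` python
# Sketch: TRAIN.batch_size from the config is divided across the cards
# listed in CUDA_VISIBLE_DEVICES (illustrative, not the repo's exact code).
import os

gpus = os.getenv("CUDA_VISIBLE_DEVICES", "0,1,2,3,4,5,6,7").split(",")
num_gpus = len(gpus)          # must match num_gpus in the config
batch_size = 256              # TRAIN.batch_size in the config
per_card_batch = batch_size // num_gpus
print(per_card_batch)         # 32 with 8 cards
```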
## Model zoo structure
......
MODEL:
name: "TSN"
format: "pkl"
num_classes: 400
seg_num: 3
seglen: 1
image_mean: [0.485, 0.456, 0.406]
image_std: [0.229, 0.224, 0.225]
num_layers: 50
topk: 5
TRAIN:
epoch: 45
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 256
use_gpu: True
num_gpus: 8
filelist: "./data/dataset/kinetics/train_video_file.list"
learning_rate: 0.01
learning_rate_decay: 0.1
l2_weight_decay: 1e-4
momentum: 0.9
total_videos: 224684
num_trainers: 1 # this will be determined by fleet implicitly, no need to set
trainer_id: 0 # this will be determined by fleet implicitly, no need to set
use_dali: True
VALID:
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 256
filelist: "./data/dataset/kinetics/val_video_file.list"
TEST:
seg_num: 7
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 16
filelist: "./data/dataset/kinetics/test_video_file.list"
use_dali: True
INFER:
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 1
filelist: "./data/dataset/kinetics/infer_video_file.list"
video_path: ""
kinetics_labels: "./data/dataset/kinetics_labels.json"
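The readers consume these sections through a get-with-default lookup (see get_config_from_sec in the reader code below). A standalone sketch of that pattern, using plain dicts in place of the repo's parsed YAML object:
``` python
# Plain-dict stand-in for the parsed YAML config: look an item up in the
# section for the current mode, falling back to a default when absent.
def get_config_from_sec(cfg, mode, item, default=None):
    return cfg.get(mode.upper(), {}).get(item, default)

cfg = {"TRAIN": {"use_dali": True}, "TEST": {}}
print(get_config_from_sec(cfg, "train", "use_dali", False))  # True
print(get_config_from_sec(cfg, "test", "num_trainers", 1))   # 1
```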
......@@ -81,6 +81,7 @@ def test(args):
config = parse_config(args.config)
test_config = merge_configs(config, 'test', vars(args))
print_configs(test_config, "Test")
use_dali = test_config['TEST'].get('use_dali', False)
# build model
test_model = models.get_model(args.model_name, test_config, mode='test')
......@@ -127,6 +128,10 @@ def test(args):
feed=test_feeder.feed(feat_data),
return_numpy=True)
test_outs += [vinfo]
elif args.model_name == 'TSN' and use_dali:
test_outs = exe.run(fetch_list=test_fetch_list,
feed={'image': data[0],
'label': data[1]})
else:
test_outs = exe.run(fetch_list=test_fetch_list,
feed=test_feeder.feed(data))
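Under DALI the reader yields (image, label) pairs directly (see ret_reader in kinetics_reader.py below), so the feed dict above is built by position instead of going through a DataFeeder. A small sketch of that contract with a stub reader (shapes are illustrative):
``` python
import numpy as np

# Stub standing in for the DALI reader: each step yields an (image, label)
# pair that is already batched, so it can be fed to exe.run by name.
def fake_dali_reader(batch_size=16, seg_num=7):
    for _ in range(2):
        image = np.zeros((batch_size, seg_num, 3, 224, 224), dtype='float32')
        label = np.zeros((batch_size, 1), dtype='int64')
        yield image, label

for data in fake_dali_reader():
    feed = {'image': data[0], 'label': data[1]}  # mirrors test.py above
```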
......
......@@ -54,6 +54,18 @@ TSN is trained on the Kinetics-400 action recognition dataset released by DeepMind.
* Weight decay coefficient: 1e-4
* The learning rate is decayed by a factor of 0.1 at 1/3 and 2/3 of the total number of training epochs (see the sketch below)
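A minimal sketch of this step schedule in plain Python (epoch count and base learning rate taken from the config above; this is illustrative, not the repo's optimizer code):
``` python
# Piecewise step decay: multiply the base lr by 0.1 at 1/3 and 2/3 of
# the total epochs (illustrative sketch).
def lr_at_epoch(epoch, total_epochs=45, base_lr=0.01, decay=0.1):
    boundaries = [total_epochs // 3, 2 * total_epochs // 3]  # 15, 30
    factor = sum(epoch >= b for b in boundaries)
    return base_lr * (decay ** factor)

print(lr_at_epoch(0))   # 0.01
print(lr_at_epoch(20))  # 0.001
print(lr_at_epoch(40))  # 0.0001
```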
**Training speed optimizations:**
* GPU decoding speeds up reading and pre-processing of source video files; NVIDIA/DALI must be installed in advance
* For DALI installation, refer to the NVIDIA/DALI [official documentation](https://docs.nvidia.com/deeplearning/sdk/dali-developer-guide/docs/compilation.html#). Since the VideoReader OP shipped with NVIDIA/DALI does not support TSN's sampling scheme, please build from the source at [SunGaofeng/DALI](https://github.com/SunGaofeng/DALI), which adds temporally sparse sampling
* Distributed training improves the multi-GPU speedup ratio
Launch script:
``` bash
bash run_dist.sh train TSN ./configs/tsn_dist_and_dali.yaml
```
## Model evaluation
The model can be evaluated in either of the following two ways:
......
......@@ -26,6 +26,16 @@ except ImportError:
from io import BytesIO
import numpy as np
import paddle
try:
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import tempfile
from nvidia.dali.plugin.paddle import DALIGenericIterator
except ImportError:
    print("DALI is not installed; install it to improve data loading performance")
from PIL import Image, ImageEnhance
import logging
......@@ -76,6 +86,13 @@ class KineticsReader(DataReader):
# set batch size and file list
self.batch_size = cfg[mode.upper()]['batch_size']
self.filelist = cfg[mode.upper()]['filelist']
# set num_trainers and trainer_id when distributed training is used
self.num_trainers = self.get_config_from_sec(mode, 'num_trainers', 1)
self.trainer_id = self.get_config_from_sec(mode, 'trainer_id', 0)
self.use_dali = self.get_config_from_sec(mode, 'use_dali', False)
self.dali_mean = cfg.MODEL.image_mean * (self.seg_num * self.seglen)
self.dali_std = cfg.MODEL.image_std * (self.seg_num * self.seglen)
if self.mode == 'infer':
self.video_path = cfg[mode.upper()]['video_path']
else:
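Note that dali_mean and dali_std above repeat the 3-channel statistics once per frame: Python list multiplication yields seg_num * seglen * 3 values, which lines up with the flattened frame-channel axis the DALI pipeline normalizes over (see VideoPipe below). A quick illustration:
``` python
# List multiplication repeats the per-channel mean once per frame so it
# matches the flattened [H, W, F*C] layout inside the DALI pipeline.
image_mean = [0.485, 0.456, 0.406]
seg_num, seglen = 3, 1
dali_mean = image_mean * (seg_num * seglen)
print(len(dali_mean))  # 9 -> [0.485, 0.456, 0.406, 0.485, 0.456, ...]
```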
......@@ -86,6 +103,10 @@ class KineticsReader(DataReader):
self.num_reader_threads = 1
def create_reader(self):
# use DALI to improve performance if enabled
if self.use_dali:
return self.build_dali_reader()
# if set video_path for inference mode, just load this single video
if (self.mode == 'infer') and (self.video_path != ''):
# load video from file stored at video_path
......@@ -201,11 +222,35 @@ class KineticsReader(DataReader):
return imgs_transform(imgs, mode, seg_num, seglen, \
short_size, target_size, img_mean, img_std, name = self.name), ret_label
def reader():
def reader_():
with open(pickle_list) as flist:
lines = [line.strip() for line in flist]
if shuffle:
random.shuffle(lines)
full_lines = [line.strip() for line in flist]
if self.mode == 'train':
if (not hasattr(reader_, 'seed')):
reader_.seed = 0
random.Random(reader_.seed).shuffle(full_lines)
print("reader shuffle seed", reader_.seed)
if reader_.seed is not None:
reader_.seed += 1
per_node_lines = int(
math.ceil(len(full_lines) * 1.0 / self.num_trainers))
total_lines = per_node_lines * self.num_trainers
# pad full_lines so that it is evenly divisible among trainers
full_lines += full_lines[:(total_lines - len(full_lines))]
assert len(full_lines) == total_lines
# each trainer takes its own shard of samples
lines = full_lines[self.trainer_id:total_lines:
self.num_trainers]
logger.info("trainerid %d, trainer_count %d" %
(self.trainer_id, self.num_trainers))
logger.info(
"read images from %d, length: %d, lines length: %d, total: %d"
% (self.trainer_id * per_node_lines, per_node_lines,
len(lines), len(full_lines)))
assert len(lines) == per_node_lines
for line in lines:
pickle_path = line.strip()
yield [pickle_path]
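The padding-plus-strided-slice sharding above guarantees every trainer sees the same number of samples per epoch. A standalone sketch of the same logic (hypothetical helper mirroring the code above):
``` python
import math

def shard(full_lines, num_trainers, trainer_id):
    # Pad with duplicates from the head so the list divides evenly,
    # then take every num_trainers-th element starting at trainer_id.
    per_node = int(math.ceil(len(full_lines) * 1.0 / num_trainers))
    total = per_node * num_trainers
    padded = full_lines + full_lines[:total - len(full_lines)]
    return padded[trainer_id:total:num_trainers]

# 10 samples over 4 trainers: each trainer gets 3 (2 are duplicates).
print(shard(list(range(10)), 4, 0))  # [0, 4, 8]
print(shard(list(range(10)), 4, 3))  # [3, 7, 1]
```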
......@@ -227,7 +272,251 @@ class KineticsReader(DataReader):
img_mean=img_mean,
img_std=img_std)
return paddle.reader.xmap_readers(mapper, reader, num_threads, buf_size)
return paddle.reader.xmap_readers(mapper, reader_, num_threads,
buf_size)
def build_dali_reader(self):
"""
build a DALI-based reader (training or test, depending on mode)
"""
def reader_():
with open(self.filelist) as flist:
full_lines = [line for line in flist]
if self.mode == 'train':
if (not hasattr(reader_, 'seed')):
reader_.seed = 0
random.Random(reader_.seed).shuffle(full_lines)
print("reader shuffle seed", reader_.seed)
if reader_.seed is not None:
reader_.seed += 1
per_node_lines = int(
math.ceil(len(full_lines) * 1.0 / self.num_trainers))
total_lines = per_node_lines * self.num_trainers
# pad full_lines so that it is evenly divisible among trainers
full_lines += full_lines[:(total_lines - len(full_lines))]
assert len(full_lines) == total_lines
# each trainer takes its own shard of samples
lines = full_lines[self.trainer_id:total_lines:
self.num_trainers]
assert len(lines) == per_node_lines
logger.info("trainerid %d, trainer_count %d" %
(self.trainer_id, self.num_trainers))
logger.info(
"read images from %d, length: %d, lines length: %d, total: %d"
% (self.trainer_id * per_node_lines, per_node_lines,
len(lines), len(full_lines)))
video_files = ''
for item in lines:
video_files += item
tf = tempfile.NamedTemporaryFile()
tf.write(str.encode(video_files))
tf.flush()
video_files = tf.name
device_id = int(os.getenv('FLAGS_selected_gpus', 0))
print('---------- device id -----------', device_id)
if self.mode == 'train':
pipe = VideoPipe(
batch_size=self.batch_size,
num_threads=1,
device_id=device_id,
file_list=video_files,
sequence_length=self.seg_num * self.seglen,
seg_num=self.seg_num,
seg_length=self.seglen,
resize_shorter_scale=self.short_size,
crop_target_size=self.target_size,
is_training=(self.mode == 'train'),
dali_mean=self.dali_mean,
dali_std=self.dali_std)
else:
pipe = VideoTestPipe(
batch_size=self.batch_size,
num_threads=1,
device_id=device_id,
file_list=video_files,
sequence_length=self.seg_num * self.seglen,
seg_num=self.seg_num,
seg_length=self.seglen,
resize_shorter_scale=self.short_size,
crop_target_size=self.target_size,
is_training=(self.mode == 'train'),
dali_mean=self.dali_mean,
dali_std=self.dali_std)
logger.info(
'initializing dataset, this may take several minutes for large datasets ... '
)
video_loader = DALIGenericIterator(
[pipe], ['image', 'label'],
len(lines),
dynamic_shape=True,
auto_reset=True)
return video_loader
dali_reader = reader_()
def ret_reader():
for data in dali_reader:
yield data[0]['image'], data[0]['label']
return ret_reader
class VideoPipe(Pipeline):
def __init__(self,
batch_size,
num_threads,
device_id,
file_list,
sequence_length,
seg_num,
seg_length,
resize_shorter_scale,
crop_target_size,
is_training=False,
initial_prefetch_size=10,
num_shards=1,
shard_id=0,
dali_mean=0.,
dali_std=1.0):
super(VideoPipe, self).__init__(batch_size, num_threads, device_id)
self.input = ops.VideoReader(
device="gpu",
file_list=file_list,
sequence_length=sequence_length,
seg_num=seg_num,
seg_length=seg_length,
is_training=is_training,
num_shards=num_shards,
shard_id=shard_id,
random_shuffle=is_training,
initial_fill=initial_prefetch_size)
# the sequence data read by ops.VideoReader is of shape [F, H, W, C].
# Because ops.Resize does not support sequence data, it is transposed
# to [H, W, F, C], reshaped to [H, W, F*C], and then resized like a
# 2-D image.
self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
self.reshape = ops.Reshape(
device="gpu", rel_shape=[1.0, 1.0, -1], layout='HWC')
self.resize = ops.Resize(
device="gpu", resize_shorter=resize_shorter_scale)
# crop and mirror are applied by ops.CropMirrorNormalize.
# Broadcasting a 3-value mean/std across the flattened frame-channel
# axis is not supported by DALI directly, so the reader repeats them
# per frame (dali_mean/dali_std) before passing them to this op.
self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))
self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))
self.mirror_generator = ops.Uniform(range=(0.0, 1.0))
self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)
self.crop_mirror_norm = ops.CropMirrorNormalize(
device="gpu",
crop=[crop_target_size, crop_target_size],
mean=dali_mean,
std=dali_std)
self.reshape_back = ops.Reshape(
device="gpu",
shape=[
seg_num, seg_length * 3, crop_target_size, crop_target_size
],
layout='FCHW')
self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
def define_graph(self):
output, label = self.input(name="Reader")
output = self.transpose(output)
output = self.reshape(output)
output = self.resize(output)
output = output / 255.
pos_x = self.pos_rng_x()
pos_y = self.pos_rng_y()
mirror_flag = self.mirror_generator()
mirror_flag = (mirror_flag > 0.5)
mirror_flag = self.cast_mirror(mirror_flag)
#output = self.crop(output, crop_pos_x=pos_x, crop_pos_y=pos_y)
output = self.crop_mirror_norm(
output, crop_pos_x=pos_x, crop_pos_y=pos_y, mirror=mirror_flag)
output = self.reshape_back(output)
label = self.cast_label(label)
return output, label
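A numpy stand-in (hypothetical sizes) makes the shape bookkeeping of the transpose/reshape trick above concrete:
``` python
import numpy as np

# ops.Resize handles 2-D images only, so the frame axis F is folded into
# the channel axis before resizing and unfolded again after cropping.
F, H, W, C = 3, 240, 320, 3                   # F = seg_num * seglen
seq = np.zeros((F, H, W, C))                  # as read by ops.VideoReader
hwc = seq.transpose(1, 2, 0, 3).reshape(H, W, F * C)
print(hwc.shape)                              # (240, 320, 9): resized like an image
# after a 224x224 crop, reshape_back restores [seg_num, seglen*3, H', W']
cropped = np.zeros((F * C, 224, 224))         # CropMirrorNormalize outputs CHW
print(cropped.reshape(3, 3, 224, 224).shape)  # (3, 3, 224, 224)
```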
class VideoTestPipe(Pipeline):
def __init__(self,
batch_size,
num_threads,
device_id,
file_list,
sequence_length,
seg_num,
seg_length,
resize_shorter_scale,
crop_target_size,
is_training=False,
initial_prefetch_size=10,
num_shards=1,
shard_id=0,
dali_mean=0.,
dali_std=1.0):
super(VideoTestPipe, self).__init__(batch_size, num_threads, device_id)
self.input = ops.VideoReader(
device="gpu",
file_list=file_list,
sequence_length=sequence_length,
seg_num=seg_num,
seg_length=seg_length,
is_training=is_training,
num_shards=num_shards,
shard_id=shard_id,
random_shuffle=is_training,
initial_fill=initial_prefetch_size)
# the sequence data read by ops.VideoReader is of shape [F, H, W, C].
# Because ops.Resize does not support sequence data, it is transposed
# to [H, W, F, C], reshaped to [H, W, F*C], and then resized like a
# 2-D image.
self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
self.reshape = ops.Reshape(
device="gpu", rel_shape=[1.0, 1.0, -1], layout='HWC')
self.resize = ops.Resize(
device="gpu", resize_shorter=resize_shorter_scale)
# crop and mirror are applied by ops.CropMirrorNormalize.
# Broadcasting a 3-value mean/std across the flattened frame-channel
# axis is not supported by DALI directly, so the reader repeats them
# per frame (dali_mean/dali_std) before passing them to this op.
self.crop_mirror_norm = ops.CropMirrorNormalize(
device="gpu",
crop=[crop_target_size, crop_target_size],
crop_pos_x=0.5,
crop_pos_y=0.5,
mirror=0,
mean=dali_mean,
std=dali_std)
self.reshape_back = ops.Reshape(
device="gpu",
shape=[
seg_num, seg_length * 3, crop_target_size, crop_target_size
],
layout='FCHW')
self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
def define_graph(self):
output, label = self.input(name="Reader")
output = self.transpose(output)
output = self.reshape(output)
output = self.resize(output)
output = output / 255.
#output = self.crop(output, crop_pos_x=pos_x, crop_pos_y=pos_y)
output = self.crop_mirror_norm(output)
output = self.reshape_back(output)
label = self.cast_label(label)
return output, label
def imgs_transform(imgs,
......
# examples of running programs:
# bash ./run.sh train CTCN ./configs/ctcn.yaml
# bash ./run.sh eval NEXTVLAD ./configs/nextvlad.yaml
# bash ./run.sh predict NONLOCAL ./configs/nonlocal.yaml
# mode should be one of [train, eval, predict, inference]
# name should be one of [AttentionCluster, AttentionLSTM, NEXTVLAD, NONLOCAL, TSN, TSM, STNET, CTCN]
# configs should be ./configs/xxx.yaml
mode=$1
name=$2
configs=$3
pretrain="" # set pretrain model path if needed
resume="" # set pretrain model path if needed
save_dir="./data/checkpoints"
save_inference_dir="./data/inference_model"
use_gpu=True
fix_random_seed=False
log_interval=1
valid_interval=1
weights="" #set the path of weights to enable eval and predicut, just ignore this when training
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
#export CUDA_VISIBLE_DEVICES=0,1,2,3
#export CUDA_VISIBLE_DEVICES=0
export FLAGS_fast_eager_deletion_mode=1
export FLAGS_eager_delete_tensor_gb=0.0
export FLAGS_fraction_of_gpu_memory_to_use=0.98
if [ "$mode"x == "train"x ]; then
echo $mode $name $configs $resume $pretrain
if [ "$resume"x != ""x ]; then
python -m paddle.distributed.launch --log_dir=log \
train_dist.py --model_name=$name \
--config=$configs \
--resume=$resume \
--log_interval=$log_interval \
--valid_interval=$valid_interval \
--use_gpu=$use_gpu \
--save_dir=$save_dir \
--fix_random_seed=$fix_random_seed
elif [ "$pretrain"x != ""x ]; then
python -m paddle.distributed.launch --log_dir=log \
train_dist.py --model_name=$name \
--config=$configs \
--pretrain=$pretrain \
--log_interval=$log_interval \
--valid_interval=$valid_interval \
--use_gpu=$use_gpu \
--save_dir=$save_dir \
--fix_random_seed=$fix_random_seed
else
python -m paddle.distributed.launch --log_dir=log \
train_dist.py --model_name=$name \
--config=$configs \
--log_interval=$log_interval \
--valid_interval=$valid_interval \
--use_gpu=$use_gpu \
--save_dir=$save_dir \
--fix_random_seed=$fix_random_seed
fi
else
echo "Not implemented mode " $mode
fi
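run_dist.sh relies on paddle.distributed.launch to start one trainer process per visible GPU; the launcher exports PADDLE_TRAINER_ID, PADDLE_TRAINERS_NUM, and FLAGS_selected_gpus, which train_dist.py reads (see below). A minimal sketch of the consumer side under those assumptions:
``` python
# Sketch: how a trainer process discovers its identity from the
# environment variables exported by paddle.distributed.launch.
import os

trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
print("trainer %d of %d on GPU %d" % (trainer_id, num_trainers, device_id))
```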
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import sys
import time
import argparse
import ast
import logging
import numpy as np
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.collective import fleet, DistributedStrategy
import paddle.fluid.incubate.fleet.base.role_maker as role_maker
from utils.train_utils import train_with_dataloader
import models
from utils.config_utils import *
from reader import get_reader
from metrics import get_metrics
from utils.utility import check_cuda
from utils.utility import check_version
logging.root.handlers = []
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT, stream=sys.stdout)
logger = logging.getLogger(__name__)
def parse_args():
parser = argparse.ArgumentParser("Paddle Video train script")
parser.add_argument(
'--model_name',
type=str,
default='AttentionCluster',
help='name of model to train.')
parser.add_argument(
'--config',
type=str,
default='configs/tsn_dist_and_dali.yaml',
help='path to config file of model')
parser.add_argument(
'--batch_size',
type=int,
default=None,
help='training batch size. None to use config file setting.')
parser.add_argument(
'--learning_rate',
type=float,
default=None,
help='learning rate used for training. None to use config file setting.')
parser.add_argument(
'--pretrain',
type=str,
default=None,
help='path to pretrain weights. None to use default weights path in ~/.paddle/weights.'
)
parser.add_argument(
'--resume',
type=str,
default=None,
help='path to resume training based on previous checkpoints. '
'None for not resuming any checkpoints.')
parser.add_argument(
'--use_gpu',
type=ast.literal_eval,
default=True,
help='default use gpu.')
parser.add_argument(
'--no_memory_optimize',
action='store_true',
default=False,
help='whether to use memory optimization during training')
parser.add_argument(
'--epoch',
type=int,
default=None,
help='epoch number, 0 to read from the config file')
parser.add_argument(
'--valid_interval',
type=int,
default=1,
help='validation epoch interval, 0 for no validation.')
parser.add_argument(
'--save_dir',
type=str,
default=os.path.join('data', 'checkpoints'),
help='directory to save training snapshots')
parser.add_argument(
'--log_interval',
type=int,
default=10,
help='mini-batch interval to log.')
parser.add_argument(
'--fix_random_seed',
type=ast.literal_eval,
default=False,
help='If set True, enable continuous evaluation job.')
# NOTE: args for profiler, used for benchmark
parser.add_argument(
'--profiler_path',
type=str,
default='./',
help='the path to store profiler output file. used for benchmark.')
parser.add_argument(
'--is_profiler',
type=int,
default=0,
help='switch to enable the profiler; used for benchmarking.')
parser.add_argument(
'--num_trainers',
type=int,
default=1,
help='the number of trainers when used in distributed training. No need to set this, it will be set automatically'
)
parser.add_argument(
'--trainer_id',
type=int,
default=0,
help='trainer id when used in distributed training. No need to set this, it will be set automatically'
)
args = parser.parse_args()
return args
def train(args):
# implement distributed training by fleet
use_fleet = True
if use_fleet:
role = role_maker.PaddleCloudRoleMaker(is_collective=True)
fleet.init(role)
args.num_trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
print('-------------', args.num_trainers, args.trainer_id)
if args.trainer_id == 0:
if not os.path.exists(args.save_dir):
os.makedirs(args.save_dir)
# parse config
config = parse_config(args.config)
train_config = merge_configs(config, 'train', vars(args))
print_configs(train_config, 'Train')
train_model = models.get_model(args.model_name, train_config, mode='train')
# build model
startup = fluid.Program()
train_prog = fluid.Program()
if args.fix_random_seed:
startup.random_seed = 1000
train_prog.random_seed = 1000
with fluid.program_guard(train_prog, startup):
with fluid.unique_name.guard():
train_model.build_input(use_dataloader=True)
train_model.build_model()
# the input has the form [data1, data2, ..., label], so train_feeds[-1] is the label
train_feeds = train_model.feeds()
train_fetch_list = train_model.fetches()
train_loss = train_fetch_list[0]
optimizer = train_model.optimizer()
if use_fleet:
optimizer = fleet.distributed_optimizer(optimizer)
optimizer.minimize(train_loss)
train_dataloader = train_model.dataloader()
gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(startup)
if args.resume:
# if resume weights are given, load them directly
assert os.path.exists(args.resume + '.pdparams'), \
"Given resume weight dir {}.pdparams not exist.".format(args.resume)
fluid.load(train_prog, model_path=args.resume, executor=exe)
else:
# if not in resume mode, load pretrain weights
if args.pretrain:
assert os.path.exists(args.pretrain), \
"Given pretrain weight dir {} not exist.".format(args.pretrain)
pretrain = args.pretrain or train_model.get_pretrain_weights()
if pretrain:
train_model.load_pretrain_params(exe, pretrain, train_prog, place)
build_strategy = fluid.BuildStrategy()
build_strategy.enable_inplace = True
if args.model_name in ['CTCN']:
build_strategy.enable_sequential_execution = True
exec_strategy = fluid.ExecutionStrategy()
if use_fleet:
compiled_train_prog = fleet.main_program
else:
compiled_train_prog = fluid.compiler.CompiledProgram(
train_prog).with_data_parallel(
loss_name=train_loss.name,
build_strategy=build_strategy,
exec_strategy=exec_strategy)
# get reader
bs_denominator = 1
if args.use_gpu:
# check number of GPUs
gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
if gpus == "":
pass
else:
gpus = gpus.split(",")
num_gpus = len(gpus)
assert num_gpus == train_config.TRAIN.num_gpus, \
"num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
"shoud be the same as that " \
"set in {}({})".format(
num_gpus, args.config, train_config.TRAIN.num_gpus)
bs_denominator = train_config.TRAIN.num_gpus
train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
bs_denominator)
train_reader = get_reader(args.model_name.upper(), 'train', train_config)
# get metrics
train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)
epochs = args.epoch or train_model.epoch_num()
exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
train_dataloader.set_batch_generator(train_reader, places=place)
train_with_dataloader(
exe,
train_prog,
compiled_train_prog,
train_dataloader,
train_fetch_list,
train_metrics,
epochs=epochs,
log_interval=args.log_interval,
save_dir=args.save_dir,
num_trainers=args.num_trainers,
trainer_id=args.trainer_id,
save_model_name=args.model_name,
fix_random_seed=args.fix_random_seed,
is_profiler=args.is_profiler,
profiler_path=args.profiler_path)
if __name__ == "__main__":
args = parse_args()
# check whether the installed paddle is compiled with GPU
check_cuda(args.use_gpu)
check_version()
logger.info(args)
train(args)
......@@ -75,6 +75,7 @@ def test_with_dataloader(exe,
def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \
train_fetch_list, train_metrics, epochs = 10, \
log_interval = 0, valid_interval = 0, save_dir = './', \
num_trainers = 1, trainer_id = 0, \
save_model_name = 'model', fix_random_seed = False, \
compiled_test_prog = None, test_dataloader = None, \
test_fetch_list = None, test_metrics = None, \
......@@ -89,17 +90,21 @@ def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader
train_iter = 0
epoch_periods = []
cur_time = time.time()
for data in train_dataloader():
cur_time = time.time()
train_outs = exe.run(compiled_train_prog,
fetch_list=train_fetch_list,
feed=data)
period = time.time() - cur_time
epoch_periods.append(period)
timeStamp = time.time()
localTime = time.localtime(timeStamp)
strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime)
if log_interval > 0 and (train_iter % log_interval == 0):
train_metrics.calculate_and_log_out(train_outs, \
info = '[TRAIN] Epoch {}, iter {} '.format(epoch, train_iter))
info = '[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period))
train_iter += 1
cur_time = time.time()
# NOTE: profiler tools, used for benchmark
if is_profiler and epoch == 0 and train_iter == log_interval:
......@@ -115,15 +120,18 @@ def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader
logger.info('[TRAIN] Epoch {} training finished, average time: {}'.
format(epoch, np.mean(epoch_periods[1:])))
save_model(exe, train_prog, save_dir, save_model_name,
"_epoch{}".format(epoch))
if trainer_id == 0:
save_model(exe, train_prog, save_dir, save_model_name,
"_epoch{}".format(epoch))
if compiled_test_prog and valid_interval > 0 and (
epoch + 1) % valid_interval == 0:
test_with_dataloader(exe, compiled_test_prog, test_dataloader,
test_fetch_list, test_metrics, log_interval,
save_model_name)
save_model(exe, train_prog, save_dir, save_model_name)
if trainer_id == 0:
save_model(exe, train_prog, save_dir, save_model_name)
# when fixing the random seed for debugging
if fix_random_seed:
cards = os.environ.get('CUDA_VISIBLE_DEVICES')
......