Unverified commit e3201306, authored by chajchaj, committed by GitHub

support data_parallel training and ucf101 dataset (#4819)

Parent 2c8b76b1
......@@ -6,44 +6,61 @@
## Contents
- [Introduction](#introduction)
- [Installation](#installation)
- [Data Preparation](#data-preparation)
- [Training](#training)
- [Evaluation](#evaluation)

## Introduction
The Temporal Shift Module (TSM), proposed by Ji Lin, Chuang Gan, and Song Han of MIT and IBM Watson AI Lab, improves a network's video understanding ability by shifting features along the temporal dimension. For details, see the paper [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383v1).

## Installation
1. Running the sample code in this model zoo requires PaddlePaddle v2.0.0 or later. If the PaddlePaddle version in your environment is lower, please update it following the [installation guide](http://www.paddlepaddle.org/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html).
2. Download the model repo: git clone https://github.com/PaddlePaddle/models

### Other dependencies
- Python >= 3.7
- CUDA >= 8.0
- CUDNN >= 7.0
## Data Preparation
TSM is trained on the UCF101 action recognition dataset, which contains 101 action classes.
Set ucf101_root in ucf101_reader.py to the UCF101 dataset directory; its videos and rawframes subdirectories hold the videos and the extracted frame images (about 6.8G and 56G respectively).
Preparation steps (a minimal frame-extraction and list-generation sketch follows this list):
1. Download the official UCF101 data: wget https://www.crcv.ucf.edu/data/UCF101/UCF101.rar, then extract it into $ucf101_root/videos
2. Extract the video frames (TODO) and store them in $ucf101_root/frames
3. Generate the video path list files (steps TODO) and store them in ./data/dataset/ucf101/
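Steps 2 and 3 are still marked TODO above. The sketch below is one possible way to do them with OpenCV; it is not the project's official tooling. The 0-based label assignment and the output list name are only illustrative (the real labels and train/val splits come from the official UCF101 annotation files), and frames are written under $ucf101_root/rawframes with the 1-based img_{:05d}.jpg naming that ucf101_reader.py reads back.

```python
# frame extraction + list generation -- a minimal sketch, not the official tooling
import os
import glob
import cv2

ucf101_root = "/path/to/ucf101"  # same directory configured in ucf101_reader.py
list_dir = "./data/dataset/ucf101"
os.makedirs(list_dir, exist_ok=True)

# illustrative 0-based labels; the official classInd.txt / split files should be used in practice
classes = sorted(os.listdir(os.path.join(ucf101_root, "videos")))
label_map = {name: idx for idx, name in enumerate(classes)}

with open(os.path.join(list_dir, "ucf101_train_split_1_rawframes.txt"), "w") as flist:
    for video in sorted(glob.glob(os.path.join(ucf101_root, "videos", "*", "*.avi"))):
        class_name = os.path.basename(os.path.dirname(video))
        video_name = os.path.splitext(os.path.basename(video))[0]
        out_dir = os.path.join(ucf101_root, "rawframes", class_name, video_name)
        os.makedirs(out_dir, exist_ok=True)

        # decode every frame and store it as img_00001.jpg, img_00002.jpg, ...
        # (the 1-based img_{:05d}.jpg naming is what load_image() in ucf101_reader.py reads back)
        cap = cv2.VideoCapture(video)
        num_frames = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            num_frames += 1
            cv2.imwrite(
                os.path.join(out_dir, "img_{:05d}.jpg".format(num_frames)), frame)
        cap.release()

        # one list line per video: "<path relative to rawframes/> <num_frames> <label>"
        flist.write("{}/{} {} {}\n".format(class_name, video_name, num_frames,
                                           label_map[class_name]))
```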
## Training
Once the data is prepared, training can be launched as follows:
- Training from scratch:
    sh run_ucf101.sh
- Training from an ImageNet-pretrained ResNet backbone:
    1. Download the ResNet50 weights pretrained on ImageNet to use as initialization: wget https://paddlemodels.bj.bcebos.com/video_classification/ResNet50_pretrained.tar.gz, then extract the archive
    2. Launch training with --weights=./ResNet50_pretrained/: sh run_ucf101_imagenet.sh
- Finetuning from a Kinetics-400 pretrained model:
    1. Download the released static-graph model: wget https://paddlemodels.bj.bcebos.com/video_classification/TSM.pdparams
    2. mkdir k400_wei && mv TSM.pdparams k400_wei
    3. Launch training with --weights=k400_wei/TSM.pdparams: sh run_ucf101_k400.sh
## Evaluation
Results on the UCF101 dataset:

|Top-1|Top-5|pretrain|
|:-:|:-:|:-:|
|84.37%|95.68%|ImageNet|
|94.54%|98.96%|Kinetics-400|

See also the [static-graph implementation](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/PaddleVideo).
......@@ -16,6 +16,7 @@ import os
import time
import sys
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.layer_helper import LayerHelper
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, BatchNorm, Linear
import math
......@@ -28,7 +29,8 @@ class ConvBNLayer(fluid.dygraph.Layer):
filter_size,
stride=1,
groups=1,
act=None,
name=None):
super(ConvBNLayer, self).__init__()
self._conv = Conv2D(
......@@ -39,14 +41,22 @@ class ConvBNLayer(fluid.dygraph.Layer):
padding=(filter_size - 1) // 2,
groups=None,
act=None,
param_attr=fluid.param_attr.ParamAttr(name=name + "_weights"),
bias_attr=False)
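# BatchNorm parameter names follow the static-graph ResNet convention (bn_conv1, bn2a_branch2a, ...),
# so that ImageNet / Kinetics-400 pretrained weights can be matched by name when loading in train.py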
if name == "conv1":
bn_name = "bn_" + name
else:
bn_name = "bn" + name[3:]
self._batch_norm = BatchNorm(
num_filters,
act=act,
param_attr=ParamAttr(name=bn_name + "_scale"),
bias_attr=ParamAttr(name=bn_name + "_offset"),
moving_mean_name=bn_name + "_mean",
moving_variance_name=bn_name + "_variance")
def forward(self, inputs):
y = self._conv(inputs)
......@@ -61,32 +71,36 @@ class BottleneckBlock(fluid.dygraph.Layer):
num_filters,
stride,
shortcut=True,
seg_num=8,
name=None):
super(BottleneckBlock, self).__init__()
self.conv0 = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters,
filter_size=1,
act='relu',
name=name + "_branch2a")
self.conv1 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters,
filter_size=3,
stride=stride,
act='relu',
name=name + "_branch2b")
self.conv2 = ConvBNLayer(
num_channels=num_filters,
num_filters=num_filters * 4,
filter_size=1,
act=None,
name=name + "_branch2c")
if not shortcut:
self.short = ConvBNLayer(
num_channels=num_channels,
num_filters=num_filters * 4,
filter_size=1,
stride=stride,
name=name + "_branch1")
self.shortcut = shortcut
self.seg_num = seg_num
self._num_channels_out = int(num_filters * 4)
......@@ -119,7 +133,12 @@ class TSM_ResNet(fluid.dygraph.Layer):
num_filters = [64, 128, 256, 512]
self.conv = ConvBNLayer(
num_channels=3,
num_filters=64,
filter_size=7,
stride=2,
act='relu',
name="conv1")
self.pool2d_max = Pool2D(
pool_size=3, pool_stride=2, pool_padding=1, pool_type='max')
......@@ -129,14 +148,23 @@ class TSM_ResNet(fluid.dygraph.Layer):
for block in range(len(depth)):
shortcut = False
for i in range(depth[block]):
if self.layers in [101, 152] and block == 2:
if i == 0:
conv_name = "res" + str(block + 2) + "a"
else:
conv_name = "res" + str(block + 2) + "b" + str(i)
else:
conv_name = "res" + str(block + 2) + chr(97 + i)
bottleneck_block = self.add_sublayer(
conv_name,
BottleneckBlock(
num_channels=num_channels,
num_filters=num_filters[block],
stride=2 if i == 0 and block != 0 else 1,
shortcut=shortcut,
seg_num=self.seg_num,
name=conv_name))
num_channels = int(bottleneck_block._num_channels_out)
self.bottleneck_block_list.append(bottleneck_block)
shortcut = True
......@@ -151,9 +179,12 @@ class TSM_ResNet(fluid.dygraph.Layer):
self.class_dim,
act="softmax",
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.Uniform(-stdv, stdv),
name="fc_0.w_0"),
bias_attr=fluid.param_attr.ParamAttr(
learning_rate=2.0,
regularizer=fluid.regularizer.L2Decay(0.),
name="fc_0.b_0"))
def forward(self, inputs):
y = fluid.layers.reshape(
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import pickle
import cv2
import numpy as np
import random
class ReaderNotFoundError(Exception):
"Error: reader not found"
def __init__(self, reader_name, avail_readers):
super(ReaderNotFoundError, self).__init__()
self.reader_name = reader_name
self.avail_readers = avail_readers
def __str__(self):
msg = "Reader {} Not Found.\nAvailiable readers:\n".format(
self.reader_name)
for reader in self.avail_readers:
msg += " {}\n".format(reader)
return msg
class DataReader(object):
"""data reader for video input"""
def __init__(self, model_name, mode, cfg):
self.name = model_name
self.mode = mode
self.cfg = cfg
def create_reader(self):
"""Not implemented"""
pass
def get_config_from_sec(self, sec, item, default=None):
if sec.upper() not in self.cfg:
return default
return self.cfg[sec.upper()].get(item, default)
class ReaderZoo(object):
def __init__(self):
self.reader_zoo = {}
def regist(self, name, reader):
assert reader.__base__ == DataReader, "Unknown reader type {}".format(
type(reader))
self.reader_zoo[name] = reader
def get(self, name, mode, cfg):
for k, v in self.reader_zoo.items():
if k == name:
return v(name, mode, cfg)
raise ReaderNotFoundError(name, self.reader_zoo.keys())
# singleton reader_zoo
reader_zoo = ReaderZoo()
def regist_reader(name, reader):
reader_zoo.regist(name, reader)
def get_reader(name, mode, cfg):
reader_model = reader_zoo.get(name, mode, cfg)
return reader_model.create_reader()
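# run_ucf101.sh: 4-GPU training from scratch (the script names in these comments are inferred from the README and the --log_dir values)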
CUDA_VISIBLE_DEVICES=0,1,2,3 python3.7 -m paddle.distributed.launch --started_port 38989 --log_dir ./mylog.ucf101.frames train.py --config=./tsm_ucf101.yaml --use_gpu=True --use_data_parallel=True
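# run_ucf101_imagenet.sh: 4-GPU training initialized from ImageNet-pretrained ResNet50 weights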
CUDA_VISIBLE_DEVICES=0,1,2,3 python3.7 -m paddle.distributed.launch --started_port 18989 --log_dir ./mylog.ucf101.frames.imagenet train.py --config=./tsm_ucf101.yaml --use_gpu=True --use_data_parallel=True --weights=./ResNet50_pretrained/
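# run_ucf101_k400.sh: 4-GPU finetuning from the Kinetics-400 pretrained TSM model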
CUDA_VISIBLE_DEVICES=4,5,6,7 python3.7 -m paddle.distributed.launch --started_port 38989 --log_dir ./mylog.ucf101.frames.k400 train.py --config=./tsm_ucf101.yaml --use_gpu=True --use_data_parallel=True --weights=k400_wei/TSM.pdparams
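# presumably run_ucf101_sing.sh: single-GPU finetuning with the tsm_ucf101_sing.yaml config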
CUDA_VISIBLE_DEVICES=1 python3.7 -m paddle.distributed.launch --started_port 38989 --log_dir ./mylog.ucf101.frames.k400.sing train.py --config=./tsm_ucf101_sing.yaml --use_gpu=True --use_data_parallel=False --weights=k400_wei/TSM.pdparams
......@@ -24,6 +24,7 @@ from paddle.fluid.dygraph.base import to_variable
from model import TSM_ResNet
from config_utils import *
from reader import KineticsReader
from ucf101_reader import UCF101Reader
logging.root.handlers = []
FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
......@@ -65,12 +66,39 @@ def parse_args():
type=int,
default=None,
help='epoch number; None means read it from the config file')
parser.add_argument(
'--use_data_parallel',
type=ast.literal_eval,
default=True,
help='default use data parallel.')
parser.add_argument(
'--model_save_dir',
type=str,
default='./output',
help='default model save in ./output.')
parser.add_argument(
'--checkpoint',
type=str,
default=None,
help='path to resume training based on previous checkpoints. '
'None for not resuming any checkpoints.')
parser.add_argument(
'--model_path_pre',
type=str,
default='tsm',
help='default model path pre is tsm.')
parser.add_argument(
'--weights',
type=str,
default='./ResNet50_pretrained/',
help='default weights is ./ResNet50_pretrained/.')
args = parser.parse_args()
return args
def val(epoch, model, cfg, args):
reader = KineticsReader(mode="valid", cfg=cfg)
reader = UCF101Reader(name="TSM", mode="valid", cfg=cfg)
reader = reader.create_reader()
total_loss = 0.0
total_acc1 = 0.0
......@@ -101,9 +129,9 @@ def val(epoch, model, cfg, args):
epoch, batch_id,
avg_loss.numpy()[0], acc_top1.numpy()[0], acc_top5.numpy()[0]))
print('TEST Epoch {}, iter {}, Finish loss {} , acc1 {} , acc5 {}'.format(
epoch, batch_id, total_loss / total_sample, total_acc1 / total_sample,
total_acc5 / total_sample))
def create_optimizer(cfg, params):
......@@ -132,26 +160,66 @@ def train(args):
valid_config = merge_configs(config, 'valid', vars(args))
print_configs(train_config, 'Train')
local_rank = fluid.dygraph.parallel.Env().local_rank
use_data_parallel = args.use_data_parallel
trainer_count = fluid.dygraph.parallel.Env().nranks
if not args.use_gpu:
place = fluid.CPUPlace()
elif not args.use_data_parallel:
place = fluid.CUDAPlace(0)
else:
#(data_parallel step1/6)
place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
#load pretrain
assert os.path.exists(args.weights), \
"Given dir {} not exist.".format(args.weights)
pre_state_dict = fluid.load_program_state(args.weights)
#for key in pre_state_dict.keys():
# print('pre_state_dict.key: {}'.format(key))
with fluid.dygraph.guard(place):
#1. init model
video_model = TSM_ResNet("TSM", train_config)
#2. set weights
param_state_dict = {}
model_dict = video_model.state_dict()
for key in model_dict.keys():
weight_name = model_dict[key].name
if weight_name in pre_state_dict.keys(
) and weight_name != "fc_0.w_0" and weight_name != "fc_0.b_0":
print('Loaded pretrained weight: {}, shape: {}'.format(
weight_name, pre_state_dict[weight_name].shape))
param_state_dict[key] = pre_state_dict[weight_name]
else:
print('Skipped pretrained weight (kept random init): {}'.format(weight_name))
param_state_dict[key] = model_dict[key]
video_model.set_dict(param_state_dict)
#3. init optim
optimizer = create_optimizer(train_config.TRAIN,
video_model.parameters())
if use_data_parallel:
#(data_parallel step2,3/6)
strategy = fluid.dygraph.parallel.prepare_context()
video_model = fluid.dygraph.parallel.DataParallel(video_model,
strategy)
# 4. load checkpoint
if args.checkpoint:
assert os.path.exists(args.checkpoint + ".pdparams"), \
"Given dir {}.pdparams not exist.".format(args.checkpoint)
assert os.path.exists(args.checkpoint + ".pdopt"), \
"Given dir {}.pdopt not exist.".format(args.checkpoint)
para_dict, opti_dict = fluid.dygraph.load_dygraph(args.checkpoint)
video_model.set_dict(para_dict)
optimizer.set_dict(opti_dict)
# 5. reader
bs_denominator = 1
if args.use_gpu:
# check number of GPUs
gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
if gpus == "":
pass
......@@ -168,27 +236,36 @@ def train(args):
train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
bs_denominator)
train_reader = KineticsReader(mode="train", cfg=train_config)
train_reader = UCF101Reader(name="TSM", mode="train", cfg=train_config)
train_reader = train_reader.create_reader()
if use_data_parallel:
#(data_parallel step4/6)
train_reader = fluid.contrib.reader.distributed_batch_reader(
train_reader)
# 6. train loop
for epoch in range(train_config.TRAIN.epoch):
video_model.train()
total_loss = 0.0
total_acc1 = 0.0
total_acc5 = 0.0
total_sample = 0
t_last = time.time()
# 6.1 for each batch, call model() , backward(), and minimize()
for batch_id, data in enumerate(train_reader()):
t1 = time.time()
x_data = np.array([item[0] for item in data])
y_data = np.array([item[1] for item in data]).reshape([-1, 1])
imgs = to_variable(x_data)
labels = to_variable(y_data)
labels.stop_gradient = True
t2 = time.time()
outputs = video_model(imgs)
t3 = time.time()
loss = fluid.layers.cross_entropy(
input=outputs, label=labels, ignore_index=-1)
avg_loss = fluid.layers.mean(loss)
......@@ -198,34 +275,62 @@ def train(args):
acc_top5 = fluid.layers.accuracy(
input=outputs, label=labels, k=5)
current_step_lr = optimizer.current_step_lr()
if use_data_parallel:
#(data_parallel step5/6)
avg_loss = video_model.scale_loss(avg_loss)
avg_loss.backward()
video_model.apply_collective_grads()
else:
avg_loss.backward()
t4 = time.time()
optimizer.minimize(avg_loss)
video_model.clear_gradients()
t5 = time.time()
total_loss += avg_loss.numpy()[0]
total_acc1 += acc_top1.numpy()[0]
total_acc5 += acc_top5.numpy()[0]
total_sample += 1
print(
'TRAIN Epoch: %d, iter: %d, loss: %.5f, acc1: %.5f, acc5: %.5f, lr: %.5f, forward_cost:%.5f s, backward_cost:%.5f s, minimize_cost:%.5f s, to_variable_cost: %.5f s, batch_cost: %.5f s, reader_cost: %.5f s'
% (epoch, batch_id, avg_loss.numpy()[0],
acc_top1.numpy()[0], acc_top5.numpy()[0],
current_step_lr, t3 - t2, t4 - t3, t5 - t4, t2 - t1,
t5 - t_last, t2 - t_last))
t_last = time.time()
print(
'TRAIN End, Epoch {}, avg_loss= {}, avg_acc1= {}, avg_acc5= {}, lr={}'.
format(epoch, total_loss / total_sample, total_acc1 /
total_sample, total_acc5 / total_sample,
current_step_lr))
# 6.2 save checkpoint
if local_rank == 0:
if not os.path.isdir(args.model_save_dir):
os.makedirs(args.model_save_dir)
model_path = os.path.join(
args.model_save_dir,
args.model_path_pre + "_epoch{}".format(epoch))
fluid.dygraph.save_dygraph(video_model.state_dict(), model_path)
fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
print('save_dygraph End, Epoch {}/{} '.format(
epoch, train_config.TRAIN.epoch))
# 6.3 validation
video_model.eval()
val(epoch, video_model, valid_config, args)
# 7. save final model
if local_rank == 0:
model_path = os.path.join(args.model_save_dir,
args.model_path_pre + "_final")
fluid.dygraph.save_dygraph(video_model.state_dict(), model_path)
fluid.dygraph.save_dygraph(optimizer.state_dict(), model_path)
logger.info('[TRAIN] training finished')
......
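# tsm_ucf101.yaml: 4-card training configuration (num_gpus: 4, TRAIN batch_size: 64);
# file name inferred from --config=./tsm_ucf101.yaml in the launch scripts above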
MODEL:
name: "TSM"
format: "frames"
num_classes: 101
seg_num: 8
seglen: 1
image_mean: [0.485, 0.456, 0.406]
image_std: [0.229, 0.224, 0.225]
num_layers: 50
topk: 5
TRAIN:
epoch: 80
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 64
use_gpu: True
num_gpus: 4
filelist: "./data/dataset/ucf101/ucf101_train_split_1_rawframes.txt"
learning_rate: 0.01
learning_rate_decay: 0.1
decay_epochs: [40, 60]
l2_weight_decay: 1e-4
momentum: 0.9
total_videos: 9537
fix_random_seed: False
VALID:
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 32
filelist: "./data/dataset/ucf101/ucf101_val_split_1_rawframes.txt"
TEST:
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 16
filelist: "./data/dataset/ucf101/ucf101_val_split_1_rawframes.txt"
MODEL:
name: "TSM"
format: "frames"
num_classes: 101
seg_num: 8
seglen: 1
image_mean: [0.485, 0.456, 0.406]
image_std: [0.229, 0.224, 0.225]
num_layers: 50
topk: 5
TRAIN:
epoch: 80
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 16
use_gpu: True
num_gpus: 1
filelist: "./data/dataset/ucf101/ucf101_train_split_1_rawframes.txt"
learning_rate: 0.01
learning_rate_decay: 0.1
decay_epochs: [40, 60]
l2_weight_decay: 1e-4
momentum: 0.9
total_videos: 9537
fix_random_seed: False
VALID:
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 32
filelist: "./data/dataset/ucf101/ucf101_val_split_1_rawframes.txt"
TEST:
short_size: 256
target_size: 224
num_reader_threads: 12
buf_size: 1024
batch_size: 16
filelist: "./data/dataset/ucf101/ucf101_val_split_1_rawframes.txt"
# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import cv2
import math
import random
import functools
try:
import cPickle as pickle
from cStringIO import StringIO
except ImportError:
import pickle
from io import BytesIO
import numpy as np
import paddle
import paddle.fluid as fluid
try:
from nvidia.dali.pipeline import Pipeline
import nvidia.dali.ops as ops
import nvidia.dali.types as types
import tempfile
from nvidia.dali.plugin.paddle import DALIGenericIterator
except:
Pipeline = object
print("DALI is not installed, you can improve performance if use DALI")
from PIL import Image, ImageEnhance
import logging
from reader_utils import DataReader
logger = logging.getLogger(__name__)
python_ver = sys.version_info
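# set ucf101_root to your local UCF101 directory; it must contain the videos/ and rawframes/
# subdirectories described in the README's Data Preparation section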
ucf101_root = "/ssd4/chaj/ucf101/"
class VideoRecord(object):
'''
A record describing one entry of the frames list file:
1. self._data[0] is the relative path of the video's frame directory
2. self._data[1] is the number of frames
3. self._data[2] is the label of the video
'''
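# a (hypothetical) example list line: "ApplyLipstick/v_ApplyLipstick_g01_c01 121 1"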
def __init__(self, row):
self._data = row
@property
def path(self):
return ucf101_root + "/rawframes/" + self._data[0]
@property
def num_frames(self):
return int(self._data[1])
@property
def label(self):
return int(self._data[2])
class UCF101Reader(DataReader):
"""
Data reader for the UCF101 dataset, stored in one of three formats:
1. mp4 or avi, the original video files
2. pkl, videos decoded beforehand and stored as pickle files
3. frames, videos decoded beforehand and stored as frame images
In all cases, the reader loads the data and returns the frame data as numpy arrays and the label as an integer.
dataset cfg: format
num_classes
seg_num
short_size
target_size
num_reader_threads
buf_size
image_mean
image_std
batch_size
list
"""
def __init__(self, name, mode, cfg):
super(UCF101Reader, self).__init__(name, mode, cfg)
self.format = cfg.MODEL.format
self.num_classes = self.get_config_from_sec('model', 'num_classes')
self.seg_num = self.get_config_from_sec('model', 'seg_num')
self.seglen = self.get_config_from_sec('model', 'seglen')
self.seg_num = self.get_config_from_sec(mode, 'seg_num', self.seg_num)
self.short_size = self.get_config_from_sec(mode, 'short_size')
self.target_size = self.get_config_from_sec(mode, 'target_size')
self.num_reader_threads = self.get_config_from_sec(mode,
'num_reader_threads')
self.buf_size = self.get_config_from_sec(mode, 'buf_size')
self.fix_random_seed = self.get_config_from_sec(mode, 'fix_random_seed')
self.img_mean = np.array(cfg.MODEL.image_mean).reshape(
[3, 1, 1]).astype(np.float32)
self.img_std = np.array(cfg.MODEL.image_std).reshape(
[3, 1, 1]).astype(np.float32)
# set batch size and file list
self.batch_size = cfg[mode.upper()]['batch_size']
self.filelist = cfg[mode.upper()]['filelist']
# set num_trainers and trainer_id when distributed training is implemented
self.num_trainers = self.get_config_from_sec(mode, 'num_trainers', 1)
self.trainer_id = self.get_config_from_sec(mode, 'trainer_id', 0)
self.use_dali = self.get_config_from_sec(mode, 'use_dali', False)
self.dali_mean = cfg.MODEL.image_mean * (self.seg_num * self.seglen)
self.dali_std = cfg.MODEL.image_std * (self.seg_num * self.seglen)
if self.mode == 'infer':
self.video_path = cfg[mode.upper()]['video_path']
else:
self.video_path = ''
if self.fix_random_seed:
random.seed(0)
np.random.seed(0)
self.num_reader_threads = 1
def create_reader(self):
# use DALI to improve data loading performance, if enabled
if self.use_dali:
return self.build_dali_reader()
# if set video_path for inference mode, just load this single video
if (self.mode == 'infer') and (self.video_path != ''):
# load video from file stored at video_path
_reader = self._inference_reader_creator(
self.video_path,
self.mode,
seg_num=self.seg_num,
seglen=self.seglen,
short_size=self.short_size,
target_size=self.target_size,
img_mean=self.img_mean,
img_std=self.img_std)
else:
assert os.path.exists(self.filelist), \
'{} not exist, please check the data list'.format(
self.filelist)
_reader = self._reader_creator(
self.filelist,
self.mode,
seg_num=self.seg_num,
seglen=self.seglen,
short_size=self.short_size,
target_size=self.target_size,
img_mean=self.img_mean,
img_std=self.img_std,
shuffle=(self.mode == 'train'),
num_threads=self.num_reader_threads,
buf_size=self.buf_size,
format=self.format)
def _batch_reader():
batch_out = []
for imgs, label in _reader():
if imgs is None:
continue
batch_out.append((imgs, label))
if len(batch_out) == self.batch_size:
yield batch_out
batch_out = []
return _batch_reader
def _inference_reader_creator(self, video_path, mode, seg_num, seglen,
short_size, target_size, img_mean, img_std):
def reader():
try:
imgs = mp4_loader(video_path, seg_num, seglen, mode)
if len(imgs) < 1:
logger.error('{} frame length {} less than 1.'.format(
video_path, len(imgs)))
yield None, None
return
except Exception:
logger.error('Error when loading {}'.format(video_path))
yield None, None
return
imgs_ret = imgs_transform(imgs, mode, seg_num, seglen, short_size,
target_size, img_mean, img_std)
label_ret = video_path
yield imgs_ret, label_ret
return reader
def _reader_creator(self,
pickle_list,
mode,
seg_num,
seglen,
short_size,
target_size,
img_mean,
img_std,
shuffle=False,
num_threads=1,
buf_size=1024,
format='avi'):
def decode_mp4(sample, mode, seg_num, seglen, short_size, target_size,
img_mean, img_std):
sample = sample[0].split(' ')
mp4_path = ucf101_root + "/videos/" + sample[0] + ".avi"
# when infer, we store vid as label
label = int(sample[1]) - 1
try:
imgs = mp4_loader(mp4_path, seg_num, seglen, mode)
if len(imgs) < 1:
logger.error('{} frame length {} less than 1.'.format(
mp4_path, len(imgs)))
return None, None
except:
logger.error('Error when loading {}'.format(mp4_path))
return None, None
return imgs_transform(
imgs,
mode,
seg_num,
seglen,
short_size,
target_size,
img_mean,
img_std,
name=self.name), label
def decode_pickle(sample, mode, seg_num, seglen, short_size,
target_size, img_mean, img_std):
pickle_path = sample[0]
try:
if python_ver < (3, 0):
data_loaded = pickle.load(open(pickle_path, 'rb'))
else:
data_loaded = pickle.load(
open(pickle_path, 'rb'), encoding='bytes')
vid, label, frames = data_loaded
if len(frames) < 1:
logger.error('{} frame length {} less than 1.'.format(
pickle_path, len(frames)))
return None, None
except:
logger.info('Error when loading {}'.format(pickle_path))
return None, None
if mode == 'train' or mode == 'valid' or mode == 'test':
ret_label = label
elif mode == 'infer':
ret_label = vid
imgs = video_loader(frames, seg_num, seglen, mode)
return imgs_transform(
imgs,
mode,
seg_num,
seglen,
short_size,
target_size,
img_mean,
img_std,
name=self.name), ret_label
def decode_frames(sample, mode, seg_num, seglen, short_size,
target_size, img_mean, img_std):
recode = VideoRecord(sample[0].split(' '))
frames_dir_path = recode.path
# when infer, we store vid as label
label = recode.label
try:
imgs = frames_loader(recode, seg_num, seglen, mode)
if len(imgs) < 1:
logger.error('{} frame length {} less than 1.'.format(
frames_dir_path, len(imgs)))
return None, None
except:
logger.error('Error when loading {}'.format(frames_dir_path))
return None, None
return imgs_transform(
imgs,
mode,
seg_num,
seglen,
short_size,
target_size,
img_mean,
img_std,
name=self.name), label
def reader_():
with open(pickle_list) as flist:
full_lines = [line.strip() for line in flist]
if self.mode == 'train':
if (not hasattr(reader_, 'seed')):
reader_.seed = 0
random.Random(reader_.seed).shuffle(full_lines)
print("reader shuffle seed", reader_.seed)
if reader_.seed is not None:
reader_.seed += 1
per_node_lines = int(
math.ceil(len(full_lines) * 1.0 / self.num_trainers))
total_lines = per_node_lines * self.num_trainers
# pad full_lines so that it is evenly divisible among trainers
full_lines += full_lines[:(total_lines - len(full_lines))]
assert len(full_lines) == total_lines
# each trainer takes its own shard of the samples
lines = full_lines[self.trainer_id:total_lines:
self.num_trainers]
logger.info("trainerid %d, trainer_count %d" %
(self.trainer_id, self.num_trainers))
logger.info(
"read images from %d, length: %d, lines length: %d, total: %d"
% (self.trainer_id * per_node_lines, per_node_lines,
len(lines), len(full_lines)))
assert len(lines) == per_node_lines
for line in lines:
pickle_path = line.strip()
yield [pickle_path]
if format == 'pkl':
decode_func = decode_pickle
elif format == 'frames':
decode_func = decode_frames
elif format == 'mp4' or format == 'avi':
decode_func = decode_mp4
else:
raise NotImplementedError("Not implemented format {}".format(format))
mapper = functools.partial(
decode_func,
mode=mode,
seg_num=seg_num,
seglen=seglen,
short_size=short_size,
target_size=target_size,
img_mean=img_mean,
img_std=img_std)
return fluid.io.xmap_readers(mapper, reader_, num_threads, buf_size)
def build_dali_reader(self):
"""
build dali training reader
"""
def reader_():
with open(self.filelist) as flist:
full_lines = [line for line in flist]
if self.mode == 'train':
if (not hasattr(reader_, 'seed')):
reader_.seed = 0
random.Random(reader_.seed).shuffle(full_lines)
print("reader shuffle seed", reader_.seed)
if reader_.seed is not None:
reader_.seed += 1
per_node_lines = int(
math.ceil(len(full_lines) * 1.0 / self.num_trainers))
total_lines = per_node_lines * self.num_trainers
# pad full_lines so that it is evenly divisible among trainers
full_lines += full_lines[:(total_lines - len(full_lines))]
assert len(full_lines) == total_lines
# each trainer takes its own shard of the samples
lines = full_lines[self.trainer_id:total_lines:
self.num_trainers]
assert len(lines) == per_node_lines
logger.info("trainerid %d, trainer_count %d" %
(self.trainer_id, self.num_trainers))
logger.info(
"read images from %d, length: %d, lines length: %d, total: %d"
% (self.trainer_id * per_node_lines, per_node_lines,
len(lines), len(full_lines)))
video_files = ''
for item in lines:
video_files += item
tf = tempfile.NamedTemporaryFile()
tf.write(str.encode(video_files))
tf.flush()
video_files = tf.name
device_id = int(os.getenv('FLAGS_selected_gpus', 0))
print('---------- device id -----------', device_id)
if self.mode == 'train':
pipe = VideoPipe(
batch_size=self.batch_size,
num_threads=1,
device_id=device_id,
file_list=video_files,
sequence_length=self.seg_num * self.seglen,
seg_num=self.seg_num,
seg_length=self.seglen,
resize_shorter_scale=self.short_size,
crop_target_size=self.target_size,
is_training=(self.mode == 'train'),
dali_mean=self.dali_mean,
dali_std=self.dali_std)
else:
pipe = VideoTestPipe(
batch_size=self.batch_size,
num_threads=1,
device_id=device_id,
file_list=video_files,
sequence_length=self.seg_num * self.seglen,
seg_num=self.seg_num,
seg_length=self.seglen,
resize_shorter_scale=self.short_size,
crop_target_size=self.target_size,
is_training=(self.mode == 'train'),
dali_mean=self.dali_mean,
dali_std=self.dali_std)
logger.info(
'initializing dataset, it will take several minutes if it is too large .... '
)
video_loader = DALIGenericIterator(
[pipe], ['image', 'label'],
len(lines),
dynamic_shape=True,
auto_reset=True)
return video_loader
dali_reader = reader_()
def ret_reader():
for data in dali_reader:
yield data[0]['image'], data[0]['label']
return ret_reader
class VideoPipe(Pipeline):
def __init__(self,
batch_size,
num_threads,
device_id,
file_list,
sequence_length,
seg_num,
seg_length,
resize_shorter_scale,
crop_target_size,
is_training=False,
initial_prefetch_size=10,
num_shards=1,
shard_id=0,
dali_mean=0.,
dali_std=1.0):
super(VideoPipe, self).__init__(batch_size, num_threads, device_id)
self.input = ops.VideoReader(
device="gpu",
file_list=file_list,
sequence_length=sequence_length,
seg_num=seg_num,
seg_length=seg_length,
is_training=is_training,
num_shards=num_shards,
shard_id=shard_id,
random_shuffle=is_training,
initial_fill=initial_prefetch_size)
# the sequence data read by ops.VideoReader is of shape [F, H, W, C]
# Because the ops.Resize does not support sequence data,
# it will be transposed into [H, W, F, C],
# then reshaped to [H, W, FC], and then resized like a 2-D image.
self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
self.reshape = ops.Reshape(
device="gpu", rel_shape=[1.0, 1.0, -1], layout='HWC')
self.resize = ops.Resize(
device="gpu", resize_shorter=resize_shorter_scale)
# crops and mirror are applied by ops.CropMirrorNormalize.
# Normalization is left to Paddle because of the difficulty of dimension broadcasting;
# it is unclear whether DALI can broadcast the dimensions correctly, so the Paddle op is used for it.
self.pos_rng_x = ops.Uniform(range=(0.0, 1.0))
self.pos_rng_y = ops.Uniform(range=(0.0, 1.0))
self.mirror_generator = ops.Uniform(range=(0.0, 1.0))
self.cast_mirror = ops.Cast(dtype=types.DALIDataType.INT32)
self.crop_mirror_norm = ops.CropMirrorNormalize(
device="gpu",
crop=[crop_target_size, crop_target_size],
mean=dali_mean,
std=dali_std)
self.reshape_back = ops.Reshape(
device="gpu",
shape=[
seg_num, seg_length * 3, crop_target_size, crop_target_size
],
layout='FCHW')
self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
def define_graph(self):
output, label = self.input(name="Reader")
output = self.transpose(output)
output = self.reshape(output)
output = self.resize(output)
output = output / 255.
pos_x = self.pos_rng_x()
pos_y = self.pos_rng_y()
mirror_flag = self.mirror_generator()
mirror_flag = (mirror_flag > 0.5)
mirror_flag = self.cast_mirror(mirror_flag)
output = self.crop_mirror_norm(
output, crop_pos_x=pos_x, crop_pos_y=pos_y, mirror=mirror_flag)
output = self.reshape_back(output)
label = self.cast_label(label)
return output, label
class VideoTestPipe(Pipeline):
def __init__(self,
batch_size,
num_threads,
device_id,
file_list,
sequence_length,
seg_num,
seg_length,
resize_shorter_scale,
crop_target_size,
is_training=False,
initial_prefetch_size=10,
num_shards=1,
shard_id=0,
dali_mean=0.,
dali_std=1.0):
super(VideoTestPipe, self).__init__(batch_size, num_threads, device_id)
self.input = ops.VideoReader(
device="gpu",
file_list=file_list,
sequence_length=sequence_length,
seg_num=seg_num,
seg_length=seg_length,
is_training=is_training,
num_shards=num_shards,
shard_id=shard_id,
random_shuffle=is_training,
initial_fill=initial_prefetch_size)
# the sequence data read by ops.VideoReader is of shape [F, H, W, C]
# Because the ops.Resize does not support sequence data,
# it will be transposed into [H, W, F, C],
# then reshaped to [H, W, FC], and then resized like a 2-D image.
self.transpose = ops.Transpose(device="gpu", perm=[1, 2, 0, 3])
self.reshape = ops.Reshape(
device="gpu", rel_shape=[1.0, 1.0, -1], layout='HWC')
self.resize = ops.Resize(
device="gpu", resize_shorter=resize_shorter_scale)
# crops and mirror are applied by ops.CropMirrorNormalize.
# Normalization is left to Paddle because of the difficulty of dimension broadcasting;
# it is unclear whether DALI can broadcast the dimensions correctly, so the Paddle op is used for it.
self.crop_mirror_norm = ops.CropMirrorNormalize(
device="gpu",
crop=[crop_target_size, crop_target_size],
crop_pos_x=0.5,
crop_pos_y=0.5,
mirror=0,
mean=dali_mean,
std=dali_std)
self.reshape_back = ops.Reshape(
device="gpu",
shape=[
seg_num, seg_length * 3, crop_target_size, crop_target_size
],
layout='FCHW')
self.cast_label = ops.Cast(device="gpu", dtype=types.DALIDataType.INT64)
def define_graph(self):
output, label = self.input(name="Reader")
output = self.transpose(output)
output = self.reshape(output)
output = self.resize(output)
output = output / 255.
#output = self.crop(output, crop_pos_x=pos_x, crop_pos_y=pos_y)
output = self.crop_mirror_norm(output)
output = self.reshape_back(output)
label = self.cast_label(label)
return output, label
def imgs_transform(imgs,
mode,
seg_num,
seglen,
short_size,
target_size,
img_mean,
img_std,
name=''):
imgs = group_scale(imgs, short_size)
if mode == 'train':
if name == "TSM":
imgs = group_multi_scale_crop(imgs, short_size)
imgs = group_random_crop(imgs, target_size)
imgs = group_random_flip(imgs)
else:
imgs = group_center_crop(imgs, target_size)
np_imgs = (np.array(imgs[0]).astype('float32').transpose(
(2, 0, 1))).reshape(1, 3, target_size, target_size) / 255
for i in range(len(imgs) - 1):
img = (np.array(imgs[i + 1]).astype('float32').transpose(
(2, 0, 1))).reshape(1, 3, target_size, target_size) / 255
np_imgs = np.concatenate((np_imgs, img))
imgs = np_imgs
imgs -= img_mean
imgs /= img_std
imgs = np.reshape(imgs, (seg_num, seglen * 3, target_size, target_size))
return imgs
def group_multi_scale_crop(img_group,
target_size,
scales=None,
max_distort=1,
fix_crop=True,
more_fix_crop=True):
scales = scales if scales is not None else [1, .875, .75, .66]
input_size = [target_size, target_size]
im_size = img_group[0].size
# get random crop offset
def _sample_crop_size(im_size):
image_w, image_h = im_size[0], im_size[1]
base_size = min(image_w, image_h)
crop_sizes = [int(base_size * x) for x in scales]
crop_h = [
input_size[1] if abs(x - input_size[1]) < 3 else x
for x in crop_sizes
]
crop_w = [
input_size[0] if abs(x - input_size[0]) < 3 else x
for x in crop_sizes
]
pairs = []
for i, h in enumerate(crop_h):
for j, w in enumerate(crop_w):
if abs(i - j) <= max_distort:
pairs.append((w, h))
crop_pair = random.choice(pairs)
if not fix_crop:
w_offset = random.randint(0, image_w - crop_pair[0])
h_offset = random.randint(0, image_h - crop_pair[1])
else:
w_step = (image_w - crop_pair[0]) / 4
h_step = (image_h - crop_pair[1]) / 4
ret = list()
ret.append((0, 0)) # upper left
if w_step != 0:
ret.append((4 * w_step, 0)) # upper right
if h_step != 0:
ret.append((0, 4 * h_step)) # lower left
if h_step != 0 and w_step != 0:
ret.append((4 * w_step, 4 * h_step)) # lower right
if h_step != 0 or w_step != 0:
ret.append((2 * w_step, 2 * h_step)) # center
if more_fix_crop:
ret.append((0, 2 * h_step)) # center left
ret.append((4 * w_step, 2 * h_step)) # center right
ret.append((2 * w_step, 4 * h_step)) # lower center
ret.append((2 * w_step, 0 * h_step)) # upper center
ret.append((1 * w_step, 1 * h_step)) # upper left quarter
ret.append((3 * w_step, 1 * h_step)) # upper right quarter
ret.append((1 * w_step, 3 * h_step)) # lower left quarter
ret.append((3 * w_step, 3 * h_step)) # lower righ quarter
w_offset, h_offset = random.choice(ret)
return crop_pair[0], crop_pair[1], w_offset, h_offset
crop_w, crop_h, offset_w, offset_h = _sample_crop_size(im_size)
crop_img_group = [
img.crop((offset_w, offset_h, offset_w + crop_w, offset_h + crop_h))
for img in img_group
]
ret_img_group = [
img.resize((input_size[0], input_size[1]), Image.BILINEAR)
for img in crop_img_group
]
return ret_img_group
def group_random_crop(img_group, target_size):
w, h = img_group[0].size
th, tw = target_size, target_size
assert (w >= target_size) and (h >= target_size), \
"image width({}) and height({}) should be larger than crop size".format(
w, h, target_size)
out_images = []
x1 = random.randint(0, w - tw)
y1 = random.randint(0, h - th)
for img in img_group:
if w == tw and h == th:
out_images.append(img)
else:
out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return out_images
def group_random_flip(img_group):
v = random.random()
if v < 0.5:
ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
return ret
else:
return img_group
def group_center_crop(img_group, target_size):
img_crop = []
for img in img_group:
w, h = img.size
th, tw = target_size, target_size
assert (w >= target_size) and (h >= target_size), \
"image width({}) and height({}) should be larger than crop size".format(
w, h, target_size)
x1 = int(round((w - tw) / 2.))
y1 = int(round((h - th) / 2.))
img_crop.append(img.crop((x1, y1, x1 + tw, y1 + th)))
return img_crop
def group_scale(imgs, target_size):
resized_imgs = []
for i in range(len(imgs)):
img = imgs[i]
w, h = img.size
if (w <= h and w == target_size) or (h <= w and h == target_size):
resized_imgs.append(img)
continue
if w < h:
ow = target_size
oh = int(target_size * 4.0 / 3.0)
resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))
else:
oh = target_size
ow = int(target_size * 4.0 / 3.0)
resized_imgs.append(img.resize((ow, oh), Image.BILINEAR))
return resized_imgs
def imageloader(buf):
if isinstance(buf, str):
img = Image.open(StringIO(buf))
else:
img = Image.open(BytesIO(buf))
return img.convert('RGB')
def video_loader(frames, nsample, seglen, mode):
videolen = len(frames)
average_dur = int(videolen / nsample)
imgs = []
for i in range(nsample):
idx = 0
if mode == 'train':
if average_dur >= seglen:
idx = random.randint(0, average_dur - seglen)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= seglen:
idx = (average_dur - seglen) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + seglen):
imgbuf = frames[int(jj % videolen)]
img = imageloader(imgbuf)
imgs.append(img)
return imgs
def mp4_loader(filepath, nsample, seglen, mode):
cap = cv2.VideoCapture(filepath)
videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
sampledFrames = []
for i in range(videolen):
ret, frame = cap.read()
# maybe first frame is empty
if ret == False:
continue
img = frame[:, :, ::-1]
sampledFrames.append(img)
average_dur = int(len(sampledFrames) / nsample)
imgs = []
for i in range(nsample):
idx = 0
if mode == 'train':
if average_dur >= seglen:
idx = random.randint(0, average_dur - seglen)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= seglen:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + seglen):
imgbuf = sampledFrames[int(jj % len(sampledFrames))]
img = Image.fromarray(imgbuf, mode='RGB')
imgs.append(img)
return imgs
# helper function used to load a single frame image from a frames directory
def load_image(directory, idx):
img = cv2.imread(os.path.join(directory, 'img_{:05d}.jpg'.format(idx)))
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
return img
def frames_loader(recode, nsample, seglen, mode):
imgpath, num_frames = recode.path, recode.num_frames
average_dur = int(num_frames / nsample)
imgs = []
for i in range(nsample):
idx = 0
if mode == 'train':
if average_dur >= seglen:
idx = random.randint(0, average_dur - seglen)
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
else:
if average_dur >= seglen:
idx = (average_dur - 1) // 2
idx += i * average_dur
elif average_dur >= 1:
idx += i * average_dur
else:
idx = i
for jj in range(idx, idx + seglen):
img = load_image(imgpath, jj + 1)
img = Image.fromarray(img, mode='RGB')
imgs.append(img)
return imgs