提交 71d2297c 编写于 作者: D dengkaipeng

add TSM model

上级 a5684e58
[MODEL]
name = "TSM"
format = "pkl"
num_classes = 400
seg_num = 8
seglen = 1
image_mean = [0.485, 0.456, 0.406]
image_std = [0.229, 0.224, 0.225]
num_layers = 50
[TRAIN]
epoch = 65
short_size = 256
target_size = 224
num_reader_threads = 12
buf_size = 1024
batch_size = 128
use_gpu = True
num_gpus = 8
filelist = "./dataset/kinetics/train.list"
learning_rate = 0.01
learning_rate_decay = 0.1
decay_epochs = [40, 60]
l2_weight_decay = 1e-4
momentum = 0.9
total_videos = 239781
[VALID]
short_size = 256
target_size = 224
num_reader_threads = 12
buf_size = 1024
batch_size = 128
filelist = "./dataset/kinetics/val.list"
[TEST]
short_size = 256
target_size = 224
num_reader_threads = 12
buf_size = 1024
batch_size = 16
filelist = "./dataset/kinetics/test.list"
[INFER]
short_size = 256
target_size = 224
num_reader_threads = 12
buf_size = 1024
batch_size = 1
filelist = "./dataset/kinetics/infer.list"
......@@ -3,10 +3,11 @@ from .feature_reader import FeatureReader
from .kinetics_reader import KineticsReader
from .nonlocal_reader import NonlocalReader
# regist reader, sort by alphabet
regist_reader("ATTENTIONCLUSTER", FeatureReader)
regist_reader("NEXTVLAD", FeatureReader)
regist_reader("ATTENTIONLSTM", FeatureReader)
regist_reader("TSN", KineticsReader)
regist_reader("NEXTVLAD", FeatureReader)
regist_reader("NONLOCAL", NonlocalReader)
regist_reader("TSM", KineticsReader)
regist_reader("TSN", KineticsReader)
regist_reader("STNET", KineticsReader)
regist_reader("NONLOCAL", NonlocalReader)
......@@ -187,10 +187,11 @@ def get_metrics(name, mode, cfg):
return metrics_zoo.get(name, mode, cfg)
regist_metrics("NEXTVLAD", Youtube8mMetrics)
regist_metrics("ATTENTIONLSTM", Youtube8mMetrics)
# sort by alphabet
regist_metrics("ATTENTIONCLUSTER", Youtube8mMetrics)
regist_metrics("TSN", Kinetics400Metrics)
regist_metrics("ATTENTIONLSTM", Youtube8mMetrics)
regist_metrics("NEXTVLAD", Youtube8mMetrics)
regist_metrics("NONLOCAL", MulticropMetrics)
regist_metrics("TSM", Kinetics400Metrics)
regist_metrics("TSN", Kinetics400Metrics)
regist_metrics("STNET", Kinetics400Metrics)
regist_metrics("NONLOCAL", MulticropMetrics)
from .model import regist_model, get_model
from .attention_cluster import AttentionCluster
from .attention_lstm import AttentionLSTM
from .nextvlad import NEXTVLAD
from .nonlocal_model import NonLocal
from .tsm import TSM
from .tsn import TSN
from .stnet import STNET
from .attention_lstm import AttentionLSTM
from .nonlocal_model import NonLocal
# regist models
# regist models, sort by alphabet
regist_model("AttentionCluster", AttentionCluster)
regist_model("AttentionLSTM", AttentionLSTM)
regist_model("NEXTVLAD", NEXTVLAD)
regist_model('NONLOCAL', NonLocal)
regist_model("TSM", TSM)
regist_model("TSN", TSN)
regist_model("STNET", STNET)
regist_model("AttentionLSTM", AttentionLSTM)
regist_model('NONLOCAL', NonLocal)
# TSM 视频分类模型
---
## 内容
- [模型简介](#模型简介)
- [数据准备](#数据准备)
- [模型训练](#模型训练)
- [模型评估](#模型评估)
- [模型推断](#模型推断)
- [参考论文](#参考论文)
## 模型简介
TSM(Temporal Shift Module),Backbone采用ResNet-50结构。
详细内容请参考论文[Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383)
## 数据准备
TSM的训练数据采用由DeepMind公布的Kinetics-400动作识别数据集。数据下载及准备请参考[数据说明](../../dataset/README.md)
## 模型训练
数据准备完毕后,可以通过如下两种方式启动训练:
python train.py --model-name=TSM
--config=./configs/tsm.txt
--save-dir=checkpoints
--log-interval=10
--valid-interval=1
bash scripts/train/train_tsm.sh
- 可下载已发布模型[model](https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz)通过`--resume`指定权重存放路径进行finetune等开发
**数据读取器说明:** 模型读取Kinetics-400数据集中的`mp4`数据,每条数据抽取`seg_num`段,每段抽取1帧图像,对每帧图像做随机增强后,缩放至`target_size`
**训练策略:**
* 采用Momentum优化算法训练,momentum=0.9
* 权重衰减系数为1e-4
## 模型评估
可通过如下两种方式进行模型评估:
python test.py --model-name=TSM
--config=configs/tsm.txt
--log-interval=1
--weights=$PATH_TO_WEIGHTS
bash scripts/test/test_tsm.sh
- 使用`scripts/test/test_tsm.sh`进行评估时,需要修改脚本中的`--weights`参数指定需要评估的权重。
- 若未指定`--weights`参数,脚本会下载已发布模型[model](https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz)进行评估
当取如下参数时,在Kinetics400的validation数据集下评估精度如下:
| seg\_num | target\_size | Top-1 |
| :------: | :----------: | :----: |
| 8 | 224 | 0.70 |
## 模型推断
可通过如下命令进行模型推断:
python infer.py --model-name=TSM
--config=configs/tsm.txt
--log-interval=1
--weights=$PATH_TO_WEIGHTS
--filelist=$FILELIST
- 模型推断结果存储于`TSM_infer_result`中,通过`pickle`格式存储。
- 若未指定`--weights`参数,脚本会下载已发布模型[model](https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz)进行推断
## 参考论文
- [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383), Ji Lin, Chuang Gan, Song Han
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
from ..model import ModelBase
from .tsm_res_model import TSM_ResNet
import logging
logger = logging.getLogger(__name__)
__all__ = ["TSM"]
class TSM(ModelBase):
    """Temporal Shift Module (TSM) video classification model.

    Reads its hyper-parameters from the configuration, declares the network
    inputs, builds the ``TSM_ResNet`` backbone and exposes the optimizer,
    loss, feed list and pretrained-weight locations.
    """

    def __init__(self, name, cfg, mode='train'):
        """Initialise the base model, then cache the config values used here.

        Args:
            name: model name, forwarded to ``ModelBase``.
            cfg: configuration object, forwarded to ``ModelBase``.
            mode: run mode, forwarded to ``ModelBase``; it is also used as
                the config section for ``target_size`` and ``batch_size``.
        """
        super(TSM, self).__init__(name, cfg, mode=mode)
        self.get_config()

    def get_config(self):
        """Copy the needed config entries onto the instance."""
        # Network definition.
        self.num_classes = self.get_config_from_sec('model', 'num_classes')
        self.seg_num = self.get_config_from_sec('model', 'seg_num')
        self.seglen = self.get_config_from_sec('model', 'seglen')
        self.image_mean = self.get_config_from_sec('model', 'image_mean')
        self.image_std = self.get_config_from_sec('model', 'image_std')
        self.num_layers = self.get_config_from_sec('model', 'num_layers')

        # Training schedule.
        self.num_epochs = self.get_config_from_sec('train', 'epoch')
        self.total_videos = self.get_config_from_sec('train', 'total_videos')
        self.base_learning_rate = self.get_config_from_sec('train',
                                                           'learning_rate')
        self.learning_rate_decay = self.get_config_from_sec(
            'train', 'learning_rate_decay')
        self.decay_epochs = self.get_config_from_sec('train', 'decay_epochs')
        self.l2_weight_decay = self.get_config_from_sec('train',
                                                        'l2_weight_decay')
        self.momentum = self.get_config_from_sec('train', 'momentum')

        # Values that depend on the current mode's config section.
        self.target_size = self.get_config_from_sec(self.mode, 'target_size')
        self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')

    def build_input(self, use_pyreader=True):
        """Declare the image (and label) inputs of the network.

        Sets ``self.feature_input`` (a one-element list holding the image
        variable) and ``self.label_input`` (``None`` in infer mode when no
        py_reader is used). When ``use_pyreader`` is true the created reader
        is stored in ``self.py_reader``.

        Args:
            use_pyreader: feed data through a ``py_reader`` instead of plain
                data layers. Must be false in infer mode.
        """
        # Per-sample layout: [seg_num, 3 * seglen, target_size, target_size].
        image_shape = [3, self.target_size, self.target_size]
        image_shape[0] = image_shape[0] * self.seglen
        image_shape = [self.seg_num] + image_shape

        self.use_pyreader = use_pyreader
        if use_pyreader:
            assert self.mode != 'infer', \
                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
            py_reader = fluid.layers.py_reader(
                capacity=100,
                # Leading -1 is the batch dimension.
                shapes=[[-1] + image_shape, [-1] + [1]],
                dtypes=['float32', 'int64'],
                name='train_py_reader'
                if self.is_training else 'test_py_reader',
                use_double_buffer=True)
            image, label = fluid.layers.read_file(py_reader)
            self.py_reader = py_reader
        else:
            image = fluid.layers.data(
                name='image', shape=image_shape, dtype='float32')
            if self.mode != 'infer':
                label = fluid.layers.data(
                    name='label', shape=[1], dtype='int64')
            else:
                # No ground truth is fed at inference time.
                label = None

        self.feature_input = [image]
        self.label_input = label

    def build_model(self):
        """Build the TSM ResNet backbone and store its output."""
        videomodel = TSM_ResNet(
            layers=self.num_layers,
            seg_num=self.seg_num,
            is_training=self.is_training)
        out = videomodel.net(input=self.feature_input[0],
                             class_dim=self.num_classes)
        self.network_outputs = [out]

    def optimizer(self):
        """Return a Momentum optimizer with a piecewise-constant LR schedule.

        The learning rate starts at ``learning_rate`` and is multiplied by
        ``learning_rate_decay`` at every epoch listed in ``decay_epochs``.
        Any number of decay epochs is supported.
        """
        assert self.mode == 'train', "optimizer only can be get in train mode"

        # Iterations per epoch, used to convert decay epochs into step counts.
        steps_per_epoch = int(self.total_videos / self.batch_size + 1)
        boundaries = [e * steps_per_epoch for e in self.decay_epochs]

        # One value per interval, i.e. len(boundaries) + 1 entries. The
        # cumulative product keeps the exact floats of
        # [lr, lr * d, lr * d * d] for the usual two-boundary schedule.
        values = [self.base_learning_rate]
        for _ in boundaries:
            values.append(values[-1] * self.learning_rate_decay)

        return fluid.optimizer.Momentum(
            learning_rate=fluid.layers.piecewise_decay(
                boundaries=boundaries, values=values),
            momentum=self.momentum,
            regularization=fluid.regularizer.L2Decay(self.l2_weight_decay))

    def loss(self):
        """Return the mean cross-entropy between the output and the label."""
        assert self.mode != 'infer', "invalid loss calculationg in infer mode"
        cost = fluid.layers.cross_entropy(input=self.network_outputs[0],
                                          label=self.label_input,
                                          ignore_index=-1)
        self.loss_ = fluid.layers.mean(x=cost)
        return self.loss_

    def outputs(self):
        """Return the list of network output variables."""
        return self.network_outputs

    def feeds(self):
        """Return the variables to feed: image only in infer mode, else image and label."""
        if self.mode == 'infer':
            return self.feature_input
        return self.feature_input + [self.label_input]

    def pretrain_info(self):
        """Return ``(name, url)`` of the pretrained backbone archive."""
        return ('ResNet50_pretrained',
                'https://paddlemodels.bj.bcebos.com/video_classification/ResNet50_pretrained.tar.gz')

    def weights_info(self):
        """Return ``(name, url)`` of the released TSM weights archive."""
        return ('tsm_kinetics',
                'https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz')

    def load_pretrain_params(self, exe, pretrain, prog, place):
        """Load pretrained parameters into ``prog``, skipping the fc layer.

        Args:
            exe: executor used to run the load.
            pretrain: directory holding the pretrained parameters.
            prog: program whose parameters are loaded.
            place: unused here; kept for interface compatibility.
        """

        def is_parameter(var):
            # Parameters whose name contains "fc_0" are skipped.
            # NOTE(review): presumably because the classifier size differs
            # from the pretrained checkpoint -- confirm.
            return isinstance(var, fluid.framework.Parameter) and (
                not ("fc_0" in var.name))

        logger.info("Load pretrain weights from {}, exclude fc layer.".format(
            pretrain))
        # Build a real list: a filter object is a one-shot iterator on
        # Python 3, and the old local name shadowed the builtin ``vars``.
        params = list(filter(is_parameter, prog.list_vars()))
        fluid.io.load_vars(exe, pretrain, vars=params, main_program=prog)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import time
import sys
import paddle.fluid as fluid
import math
class TSM_ResNet():
    """ResNet bottleneck backbone with a temporal shift in every block."""

    def __init__(self, layers=50, seg_num=8, is_training=False):
        """Store the backbone configuration.

        Args:
            layers: ResNet depth; ``net`` accepts 50, 101 or 152.
            seg_num: number of segments per video, used by the temporal
                shift and by the final per-video averaging.
            is_training: when false, batch norm and dropout run in test mode.
        """
        self.layers = layers
        self.seg_num = seg_num
        self.is_training = is_training

    def shift_module(self, input):
        """Apply ``temporal_shift`` with ``seg_num`` segments and ratio 1/8."""
        output = fluid.layers.temporal_shift(input, self.seg_num, 1.0 / 8)
        return output

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv2d followed by batch norm with optional activation.

        ``name`` must be a string: parameter names are derived from it.
        The conv weight is ``<name>_weights``. The batch-norm prefix is
        ``bn_conv1`` for ``conv1``; otherwise ``bn`` plus ``name`` without
        its first three characters (``res2a_branch2a`` -> ``bn2a_branch2a``).
        NOTE(review): this scheme presumably matches the parameter names in
        the pretrained ResNet checkpoint -- confirm.
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            # "Same"-style padding for odd kernel sizes.
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=fluid.param_attr.ParamAttr(name=name + "_weights"),
            bias_attr=False)
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            bn_name = "bn" + name[3:]

        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            is_test=(not self.is_training),
            param_attr=fluid.param_attr.ParamAttr(name=bn_name + "_scale"),
            bias_attr=fluid.param_attr.ParamAttr(bn_name + '_offset'),
            moving_mean_name=bn_name + "_mean",
            moving_variance_name=bn_name + '_variance')

    def shortcut(self, input, ch_out, stride, name):
        """Return the residual branch.

        The input is returned unchanged when its channel count (dim 1)
        equals ``ch_out`` and ``stride`` is 1; otherwise it goes through a
        1x1 conv + batch norm projection.
        """
        ch_in = input.shape[1]
        if ch_in == ch_out and stride == 1:
            return input
        return self.conv_bn_layer(input, ch_out, 1, stride, name=name)

    def bottleneck_block(self, input, num_filters, stride, name):
        """Temporal shift, then 1x1 -> 3x3 -> 1x1 convs, plus the shortcut.

        Only the main branch sees the shifted features; the shortcut is
        taken from the unshifted ``input``.
        """
        shifted = self.shift_module(input)

        conv0 = self.conv_bn_layer(
            input=shifted,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")

        short = self.shortcut(
            input, num_filters * 4, stride, name=name + "_branch1")

        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def net(self, input, class_dim=101):
        """Build the full network and return the softmax output.

        Args:
            input: image variable; dims 2 and 3 are read as the channel
                count and the spatial size, and everything before them is
                folded into the batch dimension.
            class_dim: number of output classes.

        Raises:
            ValueError: if ``self.layers`` is not 50, 101 or 152.
        """
        layers = self.layers
        seg_num = self.seg_num

        # Number of bottleneck blocks per stage for each supported depth.
        depth_by_layers = {
            50: [3, 4, 6, 3],
            101: [3, 4, 23, 3],
            152: [3, 8, 36, 3],
        }
        if layers not in depth_by_layers:
            # Raise instead of print + exit(): library code must not end the
            # interpreter, least of all with a success status.
            raise ValueError(
                "supported layers are {} but input layer is {}".format(
                    sorted(depth_by_layers), layers))
        depth = depth_by_layers[layers]
        num_filters = [64, 128, 256, 512]

        # Fold the leading dims so every frame is one 2-D conv sample.
        channels = input.shape[2]
        short_size = input.shape[3]
        input = fluid.layers.reshape(
            x=input, shape=[-1, channels, short_size, short_size])

        conv = self.conv_bn_layer(
            input=input,
            num_filters=64,
            filter_size=7,
            stride=2,
            act='relu',
            name='conv1')
        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')

        for block in range(len(depth)):
            for i in range(depth[block]):
                # Block names: res<stage><letter>, except the third stage of
                # the 101/152 variants, which uses "a", "b1", "b2", ...
                if layers in [101, 152] and block == 2:
                    if i == 0:
                        conv_name = "res" + str(block + 2) + "a"
                    else:
                        conv_name = "res" + str(block + 2) + "b" + str(i)
                else:
                    conv_name = "res" + str(block + 2) + chr(97 + i)

                conv = self.bottleneck_block(
                    input=conv,
                    num_filters=num_filters[block],
                    # Downsample in the first block of every stage but the first.
                    stride=2 if i == 0 and block != 0 else 1,
                    name=conv_name)

        pool = fluid.layers.pool2d(
            input=conv, pool_size=7, pool_type='avg', global_pooling=True)

        dropout = fluid.layers.dropout(
            x=pool, dropout_prob=0.5, is_test=(not self.is_training))

        # Regroup frames per video and average over the segment axis.
        feature = fluid.layers.reshape(
            x=dropout, shape=[-1, seg_num, pool.shape[1]])
        out = fluid.layers.reduce_mean(feature, dim=1)

        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
        out = fluid.layers.fc(
            input=out,
            size=class_dim,
            act='softmax',
            param_attr=fluid.param_attr.ParamAttr(
                initializer=fluid.initializer.Uniform(-stdv, stdv)),
            bias_attr=fluid.param_attr.ParamAttr(
                learning_rate=2.0,
                regularizer=fluid.regularizer.L2Decay(0.)))
        return out
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册