From 71d2297cc0d345b20edec91fa8038dca80a4a67a Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 4 Apr 2019 13:20:34 +0000 Subject: [PATCH] add TSM model --- PaddleCV/video/configs/tsm.txt | 51 +++++++ PaddleCV/video/datareader/__init__.py | 7 +- PaddleCV/video/metrics/metrics_util.py | 9 +- PaddleCV/video/models/__init__.py | 12 +- PaddleCV/video/models/tsm/README.md | 83 +++++++++++ PaddleCV/video/models/tsm/__init__.py | 1 + PaddleCV/video/models/tsm/tsm.py | 138 ++++++++++++++++++ PaddleCV/video/models/tsm/tsm_res_model.py | 154 +++++++++++++++++++++ 8 files changed, 443 insertions(+), 12 deletions(-) create mode 100755 PaddleCV/video/configs/tsm.txt create mode 100644 PaddleCV/video/models/tsm/README.md create mode 100644 PaddleCV/video/models/tsm/__init__.py create mode 100644 PaddleCV/video/models/tsm/tsm.py create mode 100644 PaddleCV/video/models/tsm/tsm_res_model.py diff --git a/PaddleCV/video/configs/tsm.txt b/PaddleCV/video/configs/tsm.txt new file mode 100755 index 00000000..6410bee5 --- /dev/null +++ b/PaddleCV/video/configs/tsm.txt @@ -0,0 +1,51 @@ +[MODEL] +name = "TSM" +format = "pkl" +num_classes = 400 +seg_num = 8 +seglen = 1 +image_mean = [0.485, 0.456, 0.406] +image_std = [0.229, 0.224, 0.225] +num_layers = 50 + +[TRAIN] +epoch = 65 +short_size = 256 +target_size = 224 +num_reader_threads = 12 +buf_size = 1024 +batch_size = 128 +use_gpu = True +num_gpus = 8 +filelist = "./dataset/kinetics/train.list" +learning_rate = 0.01 +learning_rate_decay = 0.1 +decay_epochs = [40, 60] +l2_weight_decay = 1e-4 +momentum = 0.9 +total_videos = 239781 + +[VALID] +short_size = 256 +target_size = 224 +num_reader_threads = 12 +buf_size = 1024 +batch_size = 128 +filelist = "./dataset/kinetics/val.list" + +[TEST] +short_size = 256 +target_size = 224 +num_reader_threads = 12 +buf_size = 1024 +batch_size = 16 +filelist = "./dataset/kinetics/test.list" + +[INFER] +short_size = 256 +target_size = 224 +num_reader_threads = 12 +buf_size = 1024 +batch_size = 1 +filelist 
= "./dataset/kinetics/infer.list" + diff --git a/PaddleCV/video/datareader/__init__.py b/PaddleCV/video/datareader/__init__.py index 8f945159..ee898672 100644 --- a/PaddleCV/video/datareader/__init__.py +++ b/PaddleCV/video/datareader/__init__.py @@ -3,10 +3,11 @@ from .feature_reader import FeatureReader from .kinetics_reader import KineticsReader from .nonlocal_reader import NonlocalReader +# regist reader, sort by alphabet regist_reader("ATTENTIONCLUSTER", FeatureReader) -regist_reader("NEXTVLAD", FeatureReader) regist_reader("ATTENTIONLSTM", FeatureReader) -regist_reader("TSN", KineticsReader) +regist_reader("NEXTVLAD", FeatureReader) +regist_reader("NONLOCAL", NonlocalReader) regist_reader("TSM", KineticsReader) +regist_reader("TSN", KineticsReader) regist_reader("STNET", KineticsReader) -regist_reader("NONLOCAL", NonlocalReader) diff --git a/PaddleCV/video/metrics/metrics_util.py b/PaddleCV/video/metrics/metrics_util.py index f7693491..4db704c2 100644 --- a/PaddleCV/video/metrics/metrics_util.py +++ b/PaddleCV/video/metrics/metrics_util.py @@ -187,10 +187,11 @@ def get_metrics(name, mode, cfg): return metrics_zoo.get(name, mode, cfg) -regist_metrics("NEXTVLAD", Youtube8mMetrics) -regist_metrics("ATTENTIONLSTM", Youtube8mMetrics) +# sort by alphabet regist_metrics("ATTENTIONCLUSTER", Youtube8mMetrics) -regist_metrics("TSN", Kinetics400Metrics) +regist_metrics("ATTENTIONLSTM", Youtube8mMetrics) +regist_metrics("NEXTVLAD", Youtube8mMetrics) +regist_metrics("NONLOCAL", MulticropMetrics) regist_metrics("TSM", Kinetics400Metrics) +regist_metrics("TSN", Kinetics400Metrics) regist_metrics("STNET", Kinetics400Metrics) -regist_metrics("NONLOCAL", MulticropMetrics) diff --git a/PaddleCV/video/models/__init__.py b/PaddleCV/video/models/__init__.py index 006e373d..72ee303f 100644 --- a/PaddleCV/video/models/__init__.py +++ b/PaddleCV/video/models/__init__.py @@ -1,15 +1,17 @@ from .model import regist_model, get_model from .attention_cluster import AttentionCluster +from 
.attention_lstm import AttentionLSTM from .nextvlad import NEXTVLAD +from .nonlocal_model import NonLocal +from .tsm import TSM from .tsn import TSN from .stnet import STNET -from .attention_lstm import AttentionLSTM -from .nonlocal_model import NonLocal -# regist models +# regist models, sort by alphabet regist_model("AttentionCluster", AttentionCluster) +regist_model("AttentionLSTM", AttentionLSTM) regist_model("NEXTVLAD", NEXTVLAD) +regist_model('NONLOCAL', NonLocal) +regist_model("TSM", TSM) regist_model("TSN", TSN) regist_model("STNET", STNET) -regist_model("AttentionLSTM", AttentionLSTM) -regist_model('NONLOCAL', NonLocal) diff --git a/PaddleCV/video/models/tsm/README.md b/PaddleCV/video/models/tsm/README.md new file mode 100644 index 00000000..e3be1998 --- /dev/null +++ b/PaddleCV/video/models/tsm/README.md @@ -0,0 +1,83 @@ +# TSM 视频分类模型 + +--- +## 内容 + +- [模型简介](#模型简介) +- [数据准备](#数据准备) +- [模型训练](#模型训练) +- [模型评估](#模型评估) +- [模型推断](#模型推断) +- [参考论文](#参考论文) + + +## 模型简介 + +TSM(Temporal Shift Module),Backbone采用ResNet-50结构。 + +详细内容请参考论文[Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383) + +## 数据准备 + +TSM的训练数据采用由DeepMind公布的Kinetics-400动作识别数据集。数据下载及准备请参考[数据说明](../../dataset/README.md) + +## 模型训练 + +数据准备完毕后,可以通过如下两种方式启动训练: + +    python train.py --model-name=TSM +            --config=./configs/tsm.txt +            --save-dir=checkpoints +            --log-interval=10 +            --valid-interval=1 + +    bash scripts/train/train_tsm.sh + +- 可下载已发布模型[model](https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz)通过`--resume`指定权重存放路径进行finetune等开发 + +**数据读取器说明:** 模型读取Kinetics-400数据集中的`mp4`数据,每条数据抽取`seg_num`段,每段抽取1帧图像,对每帧图像做随机增强后,缩放至`target_size`。 + +**训练策略:** + +* 采用Momentum优化算法训练,momentum=0.9 +* 权重衰减系数为1e-4 + +## 模型评估 + +可通过如下两种方式进行模型评估: + +    python test.py --model-name=TSM +            --config=configs/tsm.txt +            --log-interval=1 +            --weights=$PATH_TO_WEIGHTS + +    bash scripts/test/test_tsm.sh + +- 使用`scripts/test/test_tsm.sh`进行评估时,需要修改脚本中的`--weights`参数指定需要评估的权重。 + +- 
若未指定`--weights`参数,脚本会下载已发布模型[model](https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz)进行评估 + +当取如下参数时,在Kinetics400的validation数据集下评估精度如下: + +| seg\_num | target\_size | Top-1 | +| :------: | :----------: | :----: | +| 8 | 224 | 0.70 | + +## 模型推断 + +可通过如下命令进行模型推断: + + python infer.py --model-name=TSM + --config=configs/tsm.txt + --log-interval=1 + --weights=$PATH_TO_WEIGHTS + --filelist=$FILELIST + +- 模型推断结果存储于`TSM_infer_result`中,通过`pickle`格式存储。 + +- 若未指定`--weights`参数,脚本会下载已发布模型[model](https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz)进行推断 + +## 参考论文 + +- [Temporal Shift Module for Efficient Video Understanding](https://arxiv.org/abs/1811.08383), Ji Lin, Chuang Gan, Song Han + diff --git a/PaddleCV/video/models/tsm/__init__.py b/PaddleCV/video/models/tsm/__init__.py new file mode 100644 index 00000000..0a939d36 --- /dev/null +++ b/PaddleCV/video/models/tsm/__init__.py @@ -0,0 +1 @@ +from .tsm import * diff --git a/PaddleCV/video/models/tsm/tsm.py b/PaddleCV/video/models/tsm/tsm.py new file mode 100644 index 00000000..7d8dcc77 --- /dev/null +++ b/PaddleCV/video/models/tsm/tsm.py @@ -0,0 +1,138 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +import paddle.fluid as fluid +from paddle.fluid import ParamAttr + +from ..model import ModelBase +from .tsm_res_model import TSM_ResNet + +import logging +logger = logging.getLogger(__name__) + +__all__ = ["TSM"] + + +class TSM(ModelBase): + def __init__(self, name, cfg, mode='train'): + super(TSM, self).__init__(name, cfg, mode=mode) + self.get_config() + + def get_config(self): + self.num_classes = self.get_config_from_sec('model', 'num_classes') + self.seg_num = self.get_config_from_sec('model', 'seg_num') + self.seglen = self.get_config_from_sec('model', 'seglen') + self.image_mean = self.get_config_from_sec('model', 'image_mean') + self.image_std = self.get_config_from_sec('model', 'image_std') + self.num_layers = self.get_config_from_sec('model', 'num_layers') + + self.num_epochs = self.get_config_from_sec('train', 'epoch') + self.total_videos = self.get_config_from_sec('train', 'total_videos') + self.base_learning_rate = self.get_config_from_sec('train', + 'learning_rate') + self.learning_rate_decay = self.get_config_from_sec( + 'train', 'learning_rate_decay') + self.decay_epochs = self.get_config_from_sec('train', 'decay_epochs') + self.l2_weight_decay = self.get_config_from_sec('train', + 'l2_weight_decay') + self.momentum = self.get_config_from_sec('train', 'momentum') + + self.target_size = self.get_config_from_sec(self.mode, 'target_size') + self.batch_size = self.get_config_from_sec(self.mode, 'batch_size') + + def build_input(self, use_pyreader=True): + image_shape = [3, self.target_size, self.target_size] + image_shape[0] = image_shape[0] * self.seglen + image_shape = [self.seg_num] + image_shape + self.use_pyreader = use_pyreader + if use_pyreader: + assert self.mode != 'infer', \ + 'pyreader is not recommendated when infer, please set use_pyreader to be false.' 
+ py_reader = fluid.layers.py_reader( + capacity=100, + shapes=[[-1] + image_shape, [-1] + [1]], + dtypes=['float32', 'int64'], + name='train_py_reader' + if self.is_training else 'test_py_reader', + use_double_buffer=True) + image, label = fluid.layers.read_file(py_reader) + self.py_reader = py_reader + else: + image = fluid.layers.data( + name='image', shape=image_shape, dtype='float32') + if self.mode != 'infer': + label = fluid.layers.data( + name='label', shape=[1], dtype='int64') + else: + label = None + self.feature_input = [image] + self.label_input = label + + def build_model(self): + videomodel = TSM_ResNet( + layers=self.num_layers, + seg_num=self.seg_num, + is_training=self.is_training) + out = videomodel.net(input=self.feature_input[0], + class_dim=self.num_classes) + self.network_outputs = [out] + + def optimizer(self): + assert self.mode == 'train', "optimizer only can be get in train mode" + total_videos = self.total_videos + step = int(total_videos / self.batch_size + 1) + bd = [e * step for e in self.decay_epochs] + base_lr = self.base_learning_rate + lr_decay = self.learning_rate_decay + lr = [base_lr, base_lr * lr_decay, base_lr * lr_decay * lr_decay] + l2_weight_decay = self.l2_weight_decay + momentum = self.momentum + optimizer = fluid.optimizer.Momentum( + learning_rate=fluid.layers.piecewise_decay( + boundaries=bd, values=lr), + momentum=momentum, + regularization=fluid.regularizer.L2Decay(l2_weight_decay)) + + return optimizer + + def loss(self): + assert self.mode != 'infer', "invalid loss calculationg in infer mode" + cost = fluid.layers.cross_entropy(input=self.network_outputs[0], \ + label=self.label_input, ignore_index=-1) + self.loss_ = fluid.layers.mean(x=cost) + return self.loss_ + + def outputs(self): + return self.network_outputs + + def feeds(self): + return self.feature_input if self.mode == 'infer' else self.feature_input + [ + self.label_input + ] + + def pretrain_info(self): + return ('ResNet50_pretrained', 
'https://paddlemodels.bj.bcebos.com/video_classification/ResNet50_pretrained.tar.gz') + + def weights_info(self): + return ('tsm_kinetics', + 'https://paddlemodels.bj.bcebos.com/video_classification/tsm_kinetics.tar.gz') + + def load_pretrain_params(self, exe, pretrain, prog, place): + def is_parameter(var): + return isinstance(var, fluid.framework.Parameter) and (not ("fc_0" in var.name)) + + logger.info("Load pretrain weights from {}, exclude fc layer.".format(pretrain)) + vars = filter(is_parameter, prog.list_vars()) + fluid.io.load_vars(exe, pretrain, vars=vars, main_program=prog) + diff --git a/PaddleCV/video/models/tsm/tsm_res_model.py b/PaddleCV/video/models/tsm/tsm_res_model.py new file mode 100644 index 00000000..f40a0771 --- /dev/null +++ b/PaddleCV/video/models/tsm/tsm_res_model.py @@ -0,0 +1,154 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve. +# +#Licensed under the Apache License, Version 2.0 (the "License"); +#you may not use this file except in compliance with the License. +#You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +#Unless required by applicable law or agreed to in writing, software +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +import os +import time +import sys +import paddle.fluid as fluid +import math + + +class TSM_ResNet(): + def __init__(self, layers=50, seg_num=8, is_training=False): + self.layers = layers + self.seg_num = seg_num + self.is_training = is_training + + def shift_module(self, input): + output = fluid.layers.temporal_shift(input, self.seg_num, 1.0 / 8) + return output + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None, + name=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + param_attr=fluid.param_attr.ParamAttr(name=name+"_weights"), + bias_attr=False) + if name == "conv1": + bn_name = "bn_" + name + else: + bn_name = "bn" + name[3:] + + return fluid.layers.batch_norm(input=conv, act=act, + is_test=(not self.is_training), + param_attr=fluid.param_attr.ParamAttr(name=bn_name+"_scale"), + bias_attr=fluid.param_attr.ParamAttr(bn_name+'_offset'), + moving_mean_name=bn_name+"_mean", + moving_variance_name=bn_name+'_variance') + + def shortcut(self, input, ch_out, stride, name): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride, name=name) + else: + return input + + def bottleneck_block(self, input, num_filters, stride, name): + shifted = self.shift_module(input) + + conv0 = self.conv_bn_layer( + input=shifted, num_filters=num_filters, filter_size=1, act='relu', + name=name+"_branch2a") + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu', name=name+"_branch2b") + conv2 = self.conv_bn_layer( + input=conv1, num_filters=num_filters * 4, filter_size=1, act=None, name=name+"_branch2c") + + short = self.shortcut(input, num_filters * 4, stride, name=name+"_branch1") + + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + def net(self, input, 
class_dim=101): + layers = self.layers + seg_num = self.seg_num + supported_layers = [50, 101, 152] + if layers not in supported_layers: + print("supported layers are", supported_layers, \ + "but input layer is ", layers) + exit() + + # reshape input + channels = input.shape[2] + short_size = input.shape[3] + input = fluid.layers.reshape( + x=input, shape=[-1, channels, short_size, short_size]) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu', name='conv1') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + if layers in [101, 152] and block == 2: + if i == 0: + conv_name = "res" + str(block+2) + "a" + else: + conv_name = "res" + str(block+2) + "b" + str(i) + else: + conv_name = "res" + str(block+2) + chr(97+i) + + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1, + name=conv_name) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + + dropout = fluid.layers.dropout(x=pool, dropout_prob=0.5, is_test=(not self.is_training)) + + feature = fluid.layers.reshape( + x=dropout, shape=[-1, seg_num, pool.shape[1]]) + out = fluid.layers.reduce_mean(feature, dim=1) + + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=out, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv)), + bias_attr=fluid.param_attr.ParamAttr(learning_rate=2.0, + regularizer=fluid.regularizer.L2Decay(0.))) + return out -- GitLab