From 2e011b696a766b42bf70796485904a6693fd5b03 Mon Sep 17 00:00:00 2001
From: huangjun12 <2399845970@qq.com>
Date: Thu, 17 Nov 2022 14:12:09 +0800
Subject: [PATCH] add PP-TSM (#5565)
* add PP-TSM
* remove log file
* add description
* fix Example in info.yaml
* fix notebook
* update task to video
---
modelcenter/PP-TSM/.gitkeep | 0
modelcenter/PP-TSM/APP/app.py | 42 +++
modelcenter/PP-TSM/APP/app.yml | 11 +
modelcenter/PP-TSM/APP/requirements.txt | 3 +
modelcenter/PP-TSM/benchmark_cn.md | 38 +++
modelcenter/PP-TSM/benchmark_en.md | 35 +++
modelcenter/PP-TSM/download_cn.md | 5 +
modelcenter/PP-TSM/download_en.md | 5 +
modelcenter/PP-TSM/info.yaml | 29 ++
modelcenter/PP-TSM/introduction_cn.ipynb | 326 ++++++++++++++++++++++
modelcenter/PP-TSM/introduction_en.ipynb | 328 +++++++++++++++++++++++
11 files changed, 822 insertions(+)
delete mode 100644 modelcenter/PP-TSM/.gitkeep
create mode 100644 modelcenter/PP-TSM/APP/app.py
create mode 100644 modelcenter/PP-TSM/APP/app.yml
create mode 100644 modelcenter/PP-TSM/APP/requirements.txt
create mode 100644 modelcenter/PP-TSM/benchmark_cn.md
create mode 100644 modelcenter/PP-TSM/benchmark_en.md
create mode 100644 modelcenter/PP-TSM/download_cn.md
create mode 100644 modelcenter/PP-TSM/download_en.md
create mode 100644 modelcenter/PP-TSM/info.yaml
create mode 100644 modelcenter/PP-TSM/introduction_cn.ipynb
create mode 100644 modelcenter/PP-TSM/introduction_en.ipynb
diff --git a/modelcenter/PP-TSM/.gitkeep b/modelcenter/PP-TSM/.gitkeep
deleted file mode 100644
index e69de29b..00000000
diff --git a/modelcenter/PP-TSM/APP/app.py b/modelcenter/PP-TSM/APP/app.py
new file mode 100644
index 00000000..c842bf96
--- /dev/null
+++ b/modelcenter/PP-TSM/APP/app.py
@@ -0,0 +1,42 @@
+import gradio as gr
+
+from ppvideo import PaddleVideo
+
+
+TOPK = 5
+pv = PaddleVideo(model_name='ppTSM', use_gpu=False, top_k=TOPK)
+
+
+# Define the model inference
+def model_inference(video_file):
+    # predict() returns a nested list; take the result for the first input video
+    result = pv.predict(video_file)[0][0]
+ topk_scores = result['topk_scores']
+ label_names = result['label_names']
+
+ output = {label_names[i]: float(topk_scores[i]) for i in range(TOPK)}
+
+ return output
+
+
+# Clear both the input video and the output label
+def clear_all():
+ return None, None
+
+
+with gr.Blocks() as demo:
+ gr.Markdown("PP-TSM")
+
+ with gr.Column(scale=1, min_width=100):
+ video_in = gr.Video(
+ value="https://videotag.bj.bcebos.com/Data/swim.mp4",
+ label="Input (Some formats cannot be previewed)")
+
+ with gr.Row():
+ btn1 = gr.Button("Clear")
+ btn2 = gr.Button("Submit")
+        outputs = gr.Label(num_top_classes=TOPK)
+
+ btn2.click(fn=model_inference, inputs=video_in, outputs=outputs)
+ btn1.click(fn=clear_all, inputs=None, outputs=[video_in, outputs])
+
+demo.launch()
\ No newline at end of file
diff --git a/modelcenter/PP-TSM/APP/app.yml b/modelcenter/PP-TSM/APP/app.yml
new file mode 100644
index 00000000..89c2d3da
--- /dev/null
+++ b/modelcenter/PP-TSM/APP/app.yml
@@ -0,0 +1,11 @@
+# PP-TSM-App-YAML
+
+APP_Info:
+ title: PP-TSM-App
+ colorFrom: blue
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 3.4.1
+ app_file: app.py
+ license: apache-2.0
+ device: cpu
\ No newline at end of file
diff --git a/modelcenter/PP-TSM/APP/requirements.txt b/modelcenter/PP-TSM/APP/requirements.txt
new file mode 100644
index 00000000..910b10c7
--- /dev/null
+++ b/modelcenter/PP-TSM/APP/requirements.txt
@@ -0,0 +1,3 @@
+gradio
+paddlepaddle
+ppvideo==2.3.1
diff --git a/modelcenter/PP-TSM/benchmark_cn.md b/modelcenter/PP-TSM/benchmark_cn.md
new file mode 100644
index 00000000..b9665079
--- /dev/null
+++ b/modelcenter/PP-TSM/benchmark_cn.md
@@ -0,0 +1,38 @@
+## 1. Training Benchmark
+
+### 1.1 Environment
+
+* The PP-TSM model is trained with 8 GPUs and a per-GPU batch size of 16. If you train with a different number of GPUs or a different batch size, scale the learning rate and the number of iterations linearly.
+
+### 1.2 Datasets
+
+The PP-TSM model uses the Kinetics-400 dataset for training and testing.
+
+### 1.3 Benchmark
+
+|Model name | Task | Input size | Frames | ips |
+|---|---|---|---|---|
+|pptsm_k400_frames_uniform | action recognition | 224 | 8 | 274.32 |
+
+
+
+## 2. Inference Benchmark
+
+### 2.1 Environment
+
+* The inference speed of the PP-TSM model is measured on a single V100 GPU with batch size 1, CUDA 10.2 and cuDNN 8.1.1; the TensorRT test uses TensorRT 7.0.0.11.
+
+
+### 2.2 Datasets
+
+The PP-TSM model uses the Kinetics-400 dataset for training and testing.
+
+### 2.3 Benchmark
+
+|Model name | Accuracy/% | Preprocess time/ms | Inference time/ms | Total time/ms |
+| :---- | :----: | :----: |:----: |:----: |
+|pptsm_k400_frames_uniform | 75.11 | 51.84 | 11.26 | 63.1 |
+
+## 3. Reference
+
+Reference: https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/benchmark.md
\ No newline at end of file
diff --git a/modelcenter/PP-TSM/benchmark_en.md b/modelcenter/PP-TSM/benchmark_en.md
new file mode 100644
index 00000000..eee178c8
--- /dev/null
+++ b/modelcenter/PP-TSM/benchmark_en.md
@@ -0,0 +1,35 @@
+## 1. Training Benchmark
+
+### 1.1 Environment
+
+* The PP-TSM model is trained with 8 GPUs and a per-GPU batch size of 16. If you train with a different number of GPUs or a different batch size, you should scale the learning rate and the number of iterations linearly.
+
+### 1.2 Datasets
+
+The PP-TSM model uses the Kinetics-400 dataset for training and testing.
+
+### 1.3 Benchmark
+
+|Model name | Task | Input size | Frames | ips |
+|---|---|---|---|---|
+|pptsm_k400_frames_uniform | action recognition | 224 | 8 | 274.32 |
+
+
+## 2. Inference Benchmark
+
+### 2.1 Environment
+
+* The inference speed of the PP-TSM model is measured on a single V100 GPU with batch size 1, CUDA 10.2 and cuDNN 8.1.1; the TensorRT test uses TensorRT 7.0.0.11.
+
+### 2.2 Datasets
+
+The PP-TSM model uses the Kinetics-400 dataset for training and testing.
+
+### 2.3 Benchmark
+
+| Model name | Accuracy/% | Preprocess time/ms | Inference time/ms | Total time/ms |
+| :---- | :----: | :----: |:----: |:----: |
+|pptsm_k400_frames_uniform | 75.11 | 51.84 | 11.26 | 63.1 |
+
+
+## 3. Reference
+Reference: https://github.com/PaddlePaddle/PaddleVideo/blob/develop/docs/zh-CN/benchmark.md
diff --git a/modelcenter/PP-TSM/download_cn.md b/modelcenter/PP-TSM/download_cn.md
new file mode 100644
index 00000000..dce90399
--- /dev/null
+++ b/modelcenter/PP-TSM/download_cn.md
@@ -0,0 +1,5 @@
+# Download
+
+|Model name | Task | Input size | Download |
+|---|---|---|---|
+|pptsm_k400_frames_uniform | action recognition | 224 | [Inference model](https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar)/[Pretrained model](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) |
diff --git a/modelcenter/PP-TSM/download_en.md b/modelcenter/PP-TSM/download_en.md
new file mode 100644
index 00000000..f6f6f392
--- /dev/null
+++ b/modelcenter/PP-TSM/download_en.md
@@ -0,0 +1,5 @@
+# Download
+
+| Model | Task | Input size | Download |
+|---|---|---|---|
+|pptsm_k400_frames_uniform | action recognition | 224 | [Inference model](https://videotag.bj.bcebos.com/PaddleVideo/InferenceModel/ppTSM_infer.tar)/[Pretrained model](https://videotag.bj.bcebos.com/PaddleVideo-release2.1/PPTSM/ppTSM_k400_uniform_distill.pdparams) |
diff --git a/modelcenter/PP-TSM/info.yaml b/modelcenter/PP-TSM/info.yaml
new file mode 100644
index 00000000..402f80bf
--- /dev/null
+++ b/modelcenter/PP-TSM/info.yaml
@@ -0,0 +1,29 @@
+---
+Model_Info:
+ name: "PP-TSM"
+ description: "PP-TSM是PaddleVideo自研的实用产业级视频分类模型,考虑精度和速度的平衡,进行模型瘦身和精度优化,在Kinetics-400数据集上精度较原论文提升4个点。"
+ description_en: "PP-TSM is a practical industrial video classification model developed by PaddleVideo. We slim the model size and optimize the accuracy with the considerations of the trade-off between speed and precision and achieve 76.16% in Kinetics-400 dataset."
+ icon: "@后续UE统一设计之后,会存到bos上某个位置"
+ from_repo: "PaddleVideo"
+Task:
+- tag_en: "Video"
+ tag: "视频"
+ sub_tag_en: "Short Video Content Classification"
+ sub_tag: "短视频内容分类"
+Example:
+- tag: "文体互娱"
+ tag_en: "Entertainment"
+ sub_tag: "时刻剪辑"
+ sub_tag_en: "Moment editing"
+ title: "基于PaddleVideo的足球精彩时刻剪辑"
+ title_en: "Soccer video highlights clip based on PaddleVideo"
+ url: "https://aistudio.baidu.com/aistudio/projectdetail/3473391?contributionType=1"
+ url_en:
+Datasets: "Kinetics-400, UCF-101"
+Publisher: "Baidu"
+License: "apache-2.0"
+Paper:
+- title: "TSM: Temporal Shift Module for Efficient Video Understanding"
+ url: "https://arxiv.org/pdf/1811.08383.pdf"
+IfTraining: 1
+IfOnlineDemo: 1
diff --git a/modelcenter/PP-TSM/introduction_cn.ipynb b/modelcenter/PP-TSM/introduction_cn.ipynb
new file mode 100644
index 00000000..8e226295
--- /dev/null
+++ b/modelcenter/PP-TSM/introduction_cn.ipynb
@@ -0,0 +1,326 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. PP-TSM模型简介\n",
+ "\n",
+ "视频分类与图像分类相似,均属于识别任务,对于给定的输入视频,视频分类模型需要输出其预测的标签类别。如果标签都是行为类别,则该任务也常被称为行为识别。与图像分类不同的是,视频分类往往需要利用多帧图像之间的时序信息。PP-TSM是PaddleVideo自研的实用产业级视频分类模型,在实现前沿算法的基础上,考虑精度和速度的平衡,进行模型瘦身和精度优化,使其可能满足产业落地需求。\n",
+ "\n",
+ "PP-TSM基于ResNet-50骨干网络进行优化,从数据增强、网络结构微调、训练策略、BN层优化、预训练模型选择、模型蒸馏等6个方面进行模型调优。在基本不增加计算量的前提下,使用中心采样评估方式,PP-TSM在Kinetics-400上精度较原论文实现提升3.95个点,达到76.16%,超过同等骨干网络下的3D模型,且推理速度快4.5倍!\n",
+ "\n",
+ "更多关于PaddleVideo可以点击 https://github.com/PaddlePaddle/PaddleVideo 进行了解。\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. 模型效果及应用场景\n",
+ "### 2.1 视频分类任务:\n",
+ "\n",
+ "#### 2.1.1 数据集:\n",
+ "\n",
+ "数据集以Kinetics-400为主,分为训练集和测试集。\n",
+ "\n",
+ "#### 2.1.2 模型效果速览:\n",
+ "\n",
+ "PP-TSM在视频上的预测效果为:\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "
\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. 模型如何使用\n",
+ "\n",
+ "### 3.1 模型推理:\n",
+ "* 下载 \n",
+ "\n",
+ "(不在Jupyter Notebook上运行时需要将\"!\"或者\"%\"去掉。)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%cd ~/work\n",
+ "# 克隆PaddleVideo(从gitee上更快),本项目以做持久化处理,不用克隆了。\n",
+ "!git clone https://github.com/PaddlePaddle/PaddleVideo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 安装"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# 运行脚本需在PaddleVideo目录下\n",
+ "%cd ~/work/PaddleVideo/\n",
+ "\n",
+ "# 安装所需依赖项【已经做持久化处理,无需再安装】\n",
+ "!pip install --upgrade pip\n",
+ "!pip install -r requirements.txt --user\n",
+ "\n",
+ "# 安装PaddleVideo\n",
+ "!pip install ppvideo==2.3.0 --user"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 快速体验\n",
+ "\n",
+ "恭喜! 您已经成功安装了PaddleVideo,接下来快速体验视频分类效果"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "!ppvideo --model_name='ppTSM' --use_gpu=False --video_file='data/example.avi'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "上述代码会下载训练好的PP-TSM模型,基于CPU,对data/example.avi示例文件进行预测。\n",
+ "\n",
+ "输出日志结果如下:\n",
+ "```txt\n",
+ "Current video file: data/example.avi\n",
+ " top-1 classes: [5]\n",
+ " top-1 scores: [0.95056254]\n",
+ " top-1 label names: ['archery']\n",
+ "```"
+ ]
+ },
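+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The same prediction can also be run from Python. Below is a minimal sketch using the `ppvideo` Python API (the same calls used by the Gradio demo added in this PR); the video path is illustrative:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ppvideo import PaddleVideo\n",
+    "\n",
+    "# Load the trained PP-TSM model on CPU and keep the top-5 classes.\n",
+    "pv = PaddleVideo(model_name='ppTSM', use_gpu=False, top_k=5)\n",
+    "\n",
+    "# predict() returns a nested list; the first element holds the\n",
+    "# 'topk_scores' and 'label_names' for the input video.\n",
+    "result = pv.predict('data/example.avi')[0][0]\n",
+    "print(result['label_names'], result['topk_scores'])"
+   ]
+  },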
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2 模型训练:\n",
+ "* 克隆PaddleVideo仓库(详见3.1)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 准备数据集和预训练模型\n",
+ "\n",
+ "下面以Kinetics-400小数据集为例,演示模型训练过程。开发者也可以参考该数据格式,准备自己的训练数据。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 进入PaddleVideo工作目录\n",
+ "%cd ~/work/PaddleVideo/\n",
+ "\n",
+ "# 下载Kinetics-400小数据集\n",
+ "%pushd ./data/k400\n",
+ "!wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n",
+ "!tar -xf k400_videos_small.tar\n",
+ "%popd\n",
+ "\n",
+ "# 下载预训练模型\n",
+ "!wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 修改yaml配置文件\n",
+ "\n",
+ "\n",
+ "修改配置文件` configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml`中的数据标注文件路径\n",
+ "\n",
+ "```\n",
+ "DATASET: #DATASET field\n",
+ " batch_size: 4 # 根据GPU显存适当修改\n",
+ " num_workers: 4 \n",
+ " test_batch_size: 1\n",
+ " train:\n",
+ " format: \"VideoDataset\" \n",
+ " data_prefix: \"data/k400/videos\" \n",
+ " file_path: \"data/k400/train_small_videos.list \" #修改训练集路径\n",
+ " valid:\n",
+ " format: \"VideoDataset\" \n",
+ " data_prefix: \"data/k400/videos\" \n",
+ " file_path: \"data/k400/val_small_videos.list\" #修改验证集路径\n",
+ " test:\n",
+ " format: \"VideoDataset\"\n",
+ " data_prefix: \"data/k400/videos\" \n",
+ " file_path: \"data/k400/val_small_videos.list\" #修改验证集路径\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 训练模型"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%cd ~/work/PaddleVideo/\n",
+ "%env CUDA_VISIBLE_DEVICES=0\n",
+ "#开始训练\n",
+ "!python main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 模型评估\n",
+ "\n",
+ "在训练时设置`--validate`,可以在训练时同步进行评估。对于训练好的模型,也可以使用如下命令进行评估。通过`-c`参数指定配置文件,`-w`参数指定待评估的模型。"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%cd ~/work/PaddleVideo/\n",
+ "%env CUDA_VISIBLE_DEVICES=0\n",
+ "\n",
+ "#训练完以后,进行评估\n",
+ "!python main.py --test -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml -w output/ppTSM/ppTSM_best.pdparams"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. 模型原理"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 采用 Temporal Shift Module(时序位移模块)\n",
+ "\n",
+ "PP-TSM 使用时序位移模块提取时序特征。通过通道移动的方法,在不增加任何额外参数量和计算量的情况下,极大地提升了模型对于视频时间信息的利用能力。\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n"
+ ]
+ },
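+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the channel-shift idea concrete, here is a minimal NumPy sketch of the temporal shift operation described in the TSM paper (illustrative only; the actual implementation lives in the PaddleVideo backbone):\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def temporal_shift(x, shift_div=8):\n",
+    "    # x: features of shape [N, T, C, H, W].\n",
+    "    # Shift 1/shift_div of the channels one step back in time, another\n",
+    "    # 1/shift_div one step forward, and keep the remaining channels in\n",
+    "    # place; the temporal boundaries are zero-padded.\n",
+    "    n, t, c, h, w = x.shape\n",
+    "    fold = c // shift_div\n",
+    "    out = np.zeros_like(x)\n",
+    "    out[:, :-1, :fold] = x[:, 1:, :fold]                   # frame t sees t+1\n",
+    "    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # frame t sees t-1\n",
+    "    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # unshifted channels\n",
+    "    return out\n",
+    "\n",
+    "x = np.random.rand(1, 8, 16, 4, 4)   # 8 frames, 16 channels\n",
+    "print(temporal_shift(x).shape)       # (1, 8, 16, 4, 4): no extra parameters or FLOPs"
+   ]
+  },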
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* 数据增强 VideoMix\n",
+ "\n",
+ "对于视频Mix-up,PP-TSM将两个视频以一定的权值叠加构成新的输入视频,提升网络在时空上的抗干扰能力。\n",
+ "\n",
+ "* 精确BN precise BN\n",
+ "\n",
+ "为了获取更加精确的均值和方差供BN层在测试时使用,在实验中,我们会在网络训练完一个Epoch后,固定住网络中的参数不动,然后将训练数据输入网络做前向计算,保存下来每个step的均值和方差,最终得到所有训练样本精确的均值和方差,提升测试精度。"
+ ]
+ },
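+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the precise BN aggregation step (illustrative and framework-agnostic): after the weights are frozen, the batch statistics of each forward step are recorded and then averaged to replace the BN layers' running estimates at test time.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def aggregate_precise_bn(batch_means, batch_vars):\n",
+    "    # batch_means / batch_vars: per-step BN statistics collected while\n",
+    "    # running the frozen network forward over the training data.\n",
+    "    # Plain averaging (a common approximation) replaces the running\n",
+    "    # estimates accumulated with momentum during training.\n",
+    "    return np.mean(batch_means, axis=0), np.mean(batch_vars, axis=0)\n",
+    "\n",
+    "means = [np.random.rand(64) for _ in range(10)]      # 10 steps, 64 channels\n",
+    "variances = [np.random.rand(64) for _ in range(10)]\n",
+    "mu, var = aggregate_precise_bn(means, variances)\n",
+    "print(mu.shape, var.shape)  # (64,) (64,)"
+   ]
+  },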
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. 注意事项\n",
+ "\n",
+ "PP-TSM模型提供的各配置文件均放置在configs/recognition/pptsm目录下,配置文件名按如下格式组织:\n",
+ "```模型名称_骨干网络名称_数据集名称_数据格式_测试方式_其它.yaml```\n",
+ "\n",
+ "* 数据格式包括frame和video,video表示使用在线解码的方式进行训练,frame表示先将视频解码成图像帧存储起来,训练时直接读取图片进行训练。使用不同数据格式,仅需修改配置文件中的DATASET和PIPELINE字段,参考pptsm_k400_frames_uniform.yaml和pptsm_k400_videos_uniform.yaml。注意,由于编解码的细微差异,两种格式训练得到的模型在精度上可能会有些许差异。\n",
+ "\n",
+ "* 测试方式包括uniform和dense,uniform表示中心采样,dense表示密集采样。\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. 相关论文以及引用信息\n",
+ "\n",
+ "```\n",
+ "@inproceedings{lin2019tsm,\n",
+ " title={TSM: Temporal Shift Module for Efficient Video Understanding},\n",
+ " author={Lin, Ji and Gan, Chuang and Han, Song},\n",
+ " booktitle={Proceedings of the IEEE International Conference on Computer Vision},\n",
+ " year={2019}\n",
+ "} \n",
+ "```\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/modelcenter/PP-TSM/introduction_en.ipynb b/modelcenter/PP-TSM/introduction_en.ipynb
new file mode 100644
index 00000000..b5bb9182
--- /dev/null
+++ b/modelcenter/PP-TSM/introduction_en.ipynb
@@ -0,0 +1,328 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 1. PP-TSM Introduction\n",
+ "\n",
+ "Video classification is similar to image classification which belongs to the recognition task. For a given input video, the video classification model aims to output its predicted label category. If tags are all action categories, this task is also called action recognition. Different from image classification, video classification often requires the use of temporal information between multiple frames of images. PP-TSM is a practical industrial video classification model developed by PaddleVideo. Based on the implementation of state-of-the-art algorithms, we slim the model size and optimize the accuracy with the considerations of the trade-off between speed and precision.\n",
+ "\n",
+ "PP-TSM is produced based on ResNet-50 backbone. Optimized methods includes data augmentation, network structure fine-tuning, training strategy, preciceBN, pretrain model selection and model distillation. Under the premise of basically not increasing the amount of calculation, using the center-sampling evaluation method, the accuracy of PP-TSM on Kinetics-400 is 3.95 points higher than that of the original paper, reaching 76.16%, which exceeds the 3D model under the same backbone network, and the inference speed is 4.5 times faster!\n",
+ "\n",
+ "More information about PaddleVideo can be found here https://github.com/PaddlePaddle/PaddleVideo .\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 2. Model Effects and Application Scenarios\n",
+ "### 2.1 action recognition Tasks:\n",
+ "\n",
+ "#### 2.1.1 Datasets:\n",
+ "\n",
+ "The dataset is mainly in Kinetics-400, which is divided into training set and test set.\n",
+ "\n",
+ "#### 2.1.2 Model Effects:\n",
+ "\n",
+ "The recognition effect of PP-TSM on the picture is:\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 3. How to Use the Model\n",
+ "\n",
+ "### 3.1 Model Inference:\n",
+ "* Download "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "collapsed": false,
+ "jupyter": {
+ "outputs_hidden": false
+ },
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%cd ~/work\n",
+ "\n",
+ "!git clone https://github.com/PaddlePaddle/PaddleVideo"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Installation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "# The script needs to be run in the PaddleVideo directory\n",
+ "%cd ~/work/PaddleVideo/\n",
+ "\n",
+ "# Install the required dependencies [already persisted, no need to install again].\n",
+ "!pip install --upgrade pip\n",
+ "!pip install -r requirements.txt --user\n",
+ "\n",
+ "# Install PaddleVideo\n",
+ "!pip install ppvideo==2.3.0 --user"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Quick experience\n",
+ "\n",
+ "Congratulations! Now that you've successfully installed PaddleVideo, let's get a quick feel at action recognition."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "!ppvideo --model_name='ppTSM' --use_gpu=False --video_file='data/example.avi'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "An video with the predicted result is generated.\n",
+ "\n",
+ "The result is as follows:\n",
+ "\n",
+ "```txt\n",
+ "Current video file: data/example.avi\n",
+ " top-1 classes: [5]\n",
+ " top-1 scores: [0.95056254]\n",
+ " top-1 label names: ['archery']\n",
+ "```"
+ ]
+ },
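+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The same prediction can also be run from Python. Below is a minimal sketch using the `ppvideo` Python API (the same calls used by the Gradio demo added in this PR); the video path is illustrative:\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ppvideo import PaddleVideo\n",
+    "\n",
+    "# Load the trained PP-TSM model on CPU and keep the top-5 classes.\n",
+    "pv = PaddleVideo(model_name='ppTSM', use_gpu=False, top_k=5)\n",
+    "\n",
+    "# predict() returns a nested list; the first element holds the\n",
+    "# 'topk_scores' and 'label_names' for the input video.\n",
+    "result = pv.predict('data/example.avi')[0][0]\n",
+    "print(result['label_names'], result['topk_scores'])"
+   ]
+  },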
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3.2 Model Training\n",
+ "* Clone the PaddleVideo repository (see 3.1 for details)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Prepare the datasets and pretrain model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# 进入PaddleVideo工作目录\n",
+ "%cd ~/work/PaddleVideo/\n",
+ "\n",
+ "# 下载Kinetics-400小数据集\n",
+ "%pushd ./data/k400\n",
+ "!wget -nc https://videotag.bj.bcebos.com/Data/k400_videos_small.tar\n",
+ "!tar -xf k400_videos_small.tar\n",
+ "%popd\n",
+ "\n",
+ "# 下载预训练模型\n",
+ "!wget -nc -P ./data https://videotag.bj.bcebos.com/PaddleVideo/PretrainModel/ResNet50_vd_ssld_v2_pretrained.pdparams --no-check-certificate\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Change yaml configurations files.\n",
+ "\n",
+ "\n",
+ "Change yaml configurations files` configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml`\n",
+ "\n",
+ "```\n",
+ "DATASET: #DATASET field\n",
+ " batch_size: 4 # adjust according to GPU memory\n",
+ " num_workers: 4 \n",
+ " test_batch_size: 1\n",
+ " train:\n",
+ " format: \"VideoDataset\" \n",
+ " data_prefix: \"data/k400/videos\" \n",
+ " file_path: \"data/k400/train_small_videos.list \" # modify train dataset path\n",
+ " valid:\n",
+ " format: \"VideoDataset\" \n",
+ " data_prefix: \"data/k400/videos\" \n",
+ " file_path: \"data/k400/val_small_videos.list\" #modify validation dataset path\n",
+ " test:\n",
+ " format: \"VideoDataset\"\n",
+ " data_prefix: \"data/k400/videos\" \n",
+ " file_path: \"data/k400/val_small_videos.list\" #modify validation dataset path\n",
+ "```\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Train the model."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true,
+ "tags": []
+ },
+ "outputs": [],
+ "source": [
+ "%cd ~/work/PaddleVideo/\n",
+ "%env CUDA_VISIBLE_DEVICES=0\n",
+ "\n",
+ "# Start training\n",
+ "!python main.py --validate -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Model evaluation\n",
+ "\n",
+ "Setting `--validate` in train script, the evaluation will be implemented during training. For a trained model, the evaluation code as follows.\n",
+ "\n",
+ "`-c` specify configuration file, `-w` specify checkpoints."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "%cd ~/work/PaddleVideo/\n",
+ "%env CUDA_VISIBLE_DEVICES=0\n",
+ "\n",
+ "!python main.py --test -c configs/recognition/pptsm/pptsm_k400_videos_uniform.yaml -w output/ppTSM/ppTSM_best.pdparams"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 4. Model Principles"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Temporal Shift Module\n",
+ "\n",
+ "PP-TSM use Temporal Shift Module to learn temporal information. This method greatly improves the model's ability to use the video information without adding any additional parameters or computation.\n",
+ "\n",
+ "\n",
+ "
\n",
+ "
\n"
+ ]
+ },
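+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "To make the channel-shift idea concrete, here is a minimal NumPy sketch of the temporal shift operation described in the TSM paper (illustrative only; the actual implementation lives in the PaddleVideo backbone):\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def temporal_shift(x, shift_div=8):\n",
+    "    # x: features of shape [N, T, C, H, W].\n",
+    "    # Shift 1/shift_div of the channels one step back in time, another\n",
+    "    # 1/shift_div one step forward, and keep the remaining channels in\n",
+    "    # place; the temporal boundaries are zero-padded.\n",
+    "    n, t, c, h, w = x.shape\n",
+    "    fold = c // shift_div\n",
+    "    out = np.zeros_like(x)\n",
+    "    out[:, :-1, :fold] = x[:, 1:, :fold]                   # frame t sees t+1\n",
+    "    out[:, 1:, fold:2 * fold] = x[:, :-1, fold:2 * fold]   # frame t sees t-1\n",
+    "    out[:, :, 2 * fold:] = x[:, :, 2 * fold:]              # unshifted channels\n",
+    "    return out\n",
+    "\n",
+    "x = np.random.rand(1, 8, 16, 4, 4)   # 8 frames, 16 channels\n",
+    "print(temporal_shift(x).shape)       # (1, 8, 16, 4, 4): no extra parameters or FLOPs"
+   ]
+  },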
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "* Data Augmentation VideoMix\n",
+ "\n",
+ "For Video Mix-up,The two videos are mixed with a certain weight value to form a new input video, which improves the space-time capability of the network.\n",
+ "\n",
+ "* precise BN\n",
+ "\n",
+ "In order to obtain more accurate mean and variance for BN layer to use in testing, in the experiment, we fix the parameters in the network after the network has trained an Epoch, then input the training data into the network for forward calculation, save the mean and square error of each step, and finally obtain the accurate mean and variance of all training samples to improve the testing accuracy."
+ ]
+ },
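+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal sketch of the precise BN aggregation step (illustrative and framework-agnostic): after the weights are frozen, the batch statistics of each forward step are recorded and then averaged to replace the BN layers' running estimates at test time.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "\n",
+    "def aggregate_precise_bn(batch_means, batch_vars):\n",
+    "    # batch_means / batch_vars: per-step BN statistics collected while\n",
+    "    # running the frozen network forward over the training data.\n",
+    "    # Plain averaging (a common approximation) replaces the running\n",
+    "    # estimates accumulated with momentum during training.\n",
+    "    return np.mean(batch_means, axis=0), np.mean(batch_vars, axis=0)\n",
+    "\n",
+    "means = [np.random.rand(64) for _ in range(10)]      # 10 steps, 64 channels\n",
+    "variances = [np.random.rand(64) for _ in range(10)]\n",
+    "mu, var = aggregate_precise_bn(means, variances)\n",
+    "print(mu.shape, var.shape)  # (64,) (64,)"
+   ]
+  },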
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 5. Note\n",
+ "\n",
+ "Each configuration file provided by the PP-TSM model is placed in the `configs/recognition/pptsm` directory. The configuration file name is organized in the following format:\n",
+ "\n",
+ "```ModelName_Backbone_Datasest_DataFormat_EvaluationMethod_others.yaml```\n",
+ "\n",
+ "* Data format includes frame and video, video indicates training with online video, frame indicates training with offline frames. Due to the differences caused by decoding, the models obtained from the training of this two formats may have some differences in accuracy.\n",
+ "\n",
+ "* The evaluation methods include uniform and dense. Uniform means central sampling, and dense means dense sampling.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## 6. Related papers and citations\n",
+ "\n",
+ "```\n",
+ "@inproceedings{lin2019tsm,\n",
+ " title={TSM: Temporal Shift Module for Efficient Video Understanding},\n",
+ " author={Lin, Ji and Gan, Chuang and Han, Song},\n",
+ " booktitle={Proceedings of the IEEE International Conference on Computer Vision},\n",
+ " year={2019}\n",
+ "} \n",
+ "```\n"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.8"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
--
GitLab