PaddlePaddle/models, commit 5d166b57
Authored on Oct 30, 2019 by huangjun12; committed by SunGaofeng on Oct 30, 2019
Parent commit: 97aab0ef

synchronize PaddleVideo develop with release/1.6 (#3812)
Showing 66 changed files, with 3,111 additions and 346 deletions (+3111 −346).
Changed files (66):

```
PaddleCV/PaddleVideo/README.md                                             +22   −3
PaddleCV/PaddleVideo/configs/bmn.yaml                                       +1   −1
PaddleCV/PaddleVideo/configs/bsn_tem.yaml                                   +1   −1
PaddleCV/PaddleVideo/configs/ets.yaml                                      +46   −0
PaddleCV/PaddleVideo/configs/tall.yaml                                     +67   −0
PaddleCV/PaddleVideo/data/dataset/bmn/README.md                             +1   −5
PaddleCV/PaddleVideo/data/dataset/ctcn/README.md                            +1   −1
PaddleCV/PaddleVideo/data/dataset/ets/README.md                            +36   −0
PaddleCV/PaddleVideo/data/dataset/ets/generate_data.py                    +111   −0
PaddleCV/PaddleVideo/data/dataset/ets/generate_infer_data.py               +19   −0
PaddleCV/PaddleVideo/data/dataset/ets/generate_train_pickle.py             +54   −0
PaddleCV/PaddleVideo/data/dataset/nonlocal/README.md                        +3   −3
PaddleCV/PaddleVideo/data/dataset/nonlocal/generate_filelist.py            +28   −6
PaddleCV/PaddleVideo/data/dataset/nonlocal/generate_list.sh                 +0  −12
PaddleCV/PaddleVideo/data/dataset/nonlocal/generate_testlist_multicrop.py   +0  −34
PaddleCV/PaddleVideo/data/dataset/tall/README.md                           +27   −0
PaddleCV/PaddleVideo/data/dataset/tall/gen_infer.py                        +56   −0
PaddleCV/PaddleVideo/eval.py                                               +22   −3
PaddleCV/PaddleVideo/inference_model.py                                     +1   −1
PaddleCV/PaddleVideo/metrics/bmn_metrics/README.md                          +3   −1
PaddleCV/PaddleVideo/metrics/bmn_metrics/bmn_proposal_metrics.py            +1   −1
PaddleCV/PaddleVideo/metrics/bsn_metrics/bsn_pem_metrics.py                +11   −9
PaddleCV/PaddleVideo/metrics/bsn_metrics/bsn_tem_metrics.py                +17  −12
PaddleCV/PaddleVideo/metrics/detections/detection_metrics.py                +0   −1
PaddleCV/PaddleVideo/metrics/ets_metrics/README.md                          +9   −0
PaddleCV/PaddleVideo/metrics/ets_metrics/__init__.py                        +0   −0
PaddleCV/PaddleVideo/metrics/ets_metrics/ets_metrics.py                   +107   −0
PaddleCV/PaddleVideo/metrics/metrics_util.py                               +83   −0
PaddleCV/PaddleVideo/metrics/tall_metrics/__init__.py                       +0   −0
PaddleCV/PaddleVideo/metrics/tall_metrics/tall_metrics.py                 +297   −0
PaddleCV/PaddleVideo/models/__init__.py                                     +4   −0
PaddleCV/PaddleVideo/models/attention_cluster/attention_cluster.py          +9   −8
PaddleCV/PaddleVideo/models/attention_lstm/attention_lstm.py                +8   −8
PaddleCV/PaddleVideo/models/bmn/README.md                                   +1   −0
PaddleCV/PaddleVideo/models/bmn/bmn.py                                     +18  −19
PaddleCV/PaddleVideo/models/bmn/bmn_utils.py                                +0   −1
PaddleCV/PaddleVideo/models/bsn/bsn.py                                     +35  −40
PaddleCV/PaddleVideo/models/bsn/bsn_utils.py                                +0   −1
PaddleCV/PaddleVideo/models/ctcn/ctcn.py                                   +15  −17
PaddleCV/PaddleVideo/models/ets/README.md                                 +107   −0
PaddleCV/PaddleVideo/models/ets/__init__.py                                 +1   −0
PaddleCV/PaddleVideo/models/ets/ets.py                                    +195   −0
PaddleCV/PaddleVideo/models/ets/ets_net.py                                +239   −0
PaddleCV/PaddleVideo/models/model.py                                        +4   −4
PaddleCV/PaddleVideo/models/nextvlad/nextvlad.py                           +10  −10
PaddleCV/PaddleVideo/models/nonlocal_model/nonlocal_model.py               +13  −21
PaddleCV/PaddleVideo/models/nonlocal_model/nonlocal_utils.py              +181  −37
PaddleCV/PaddleVideo/models/stnet/stnet.py                                 +73  −19
PaddleCV/PaddleVideo/models/tall/README.md                                +108   −0
PaddleCV/PaddleVideo/models/tall/__init__.py                                +1   −0
PaddleCV/PaddleVideo/models/tall/tall.py                                  +168   −0
PaddleCV/PaddleVideo/models/tall/tall_net.py                              +151   −0
PaddleCV/PaddleVideo/models/tsm/tsm.py                                      +8  −10
PaddleCV/PaddleVideo/models/tsn/README.md                                   +1   −1
PaddleCV/PaddleVideo/models/tsn/tsn.py                                      +8  −10
PaddleCV/PaddleVideo/predict.py                                            +36   −6
PaddleCV/PaddleVideo/reader/__init__.py                                     +4   −0
PaddleCV/PaddleVideo/reader/bmn_reader.py                                  +44   −1
PaddleCV/PaddleVideo/reader/bsn_reader.py                                  +88   −4
PaddleCV/PaddleVideo/reader/ets_reader.py                                 +170   −0
PaddleCV/PaddleVideo/reader/kinetics_reader.py                              +2   −0
PaddleCV/PaddleVideo/reader/tall_reader.py                                +324   −0
PaddleCV/PaddleVideo/run.sh                                                 +1   −1
PaddleCV/PaddleVideo/train.py                                              +22  −16
PaddleCV/PaddleVideo/utils/train_utils.py                                  +18  −18
PaddleCV/PaddleVideo/utils/utility.py                                      +20   −0
```
PaddleCV/PaddleVideo/README.md

```diff
@@ -16,16 +16,18 @@
 | [C-TCN](./models/ctcn/README.md) | Video action localization | Winning solution of the 2018 ActivityNet challenge |
 | [BSN](./models/bsn/README.md) | Video action localization | Provides an efficient proposal-generation method for temporal action localization |
 | [BMN](./models/bmn/README.md) | Video action localization | Winning solution of the 2019 ActivityNet challenge |
+| [ETS](./models/ets/README.md) | Video captioning | ICCV'15 modeling method built on a temporal-attention mechanism |
+| [TALL](./models/tall/README.md) | Video grounding | ICCV'17 multi-modal temporal-regression localization method |

 ### Main features

-- Covers several leading models for video classification and action localization. Attention LSTM, Attention Cluster and NeXtVLAD are popular feature-sequence models, while Non-local, TSN, TSM and StNet are end-to-end video classification models. Attention LSTM is fast and accurate; NeXtVLAD was the best single model in the 2nd YouTube-8M competition; TSN is the classic 2D-CNN-based solution; TSM is a simple and efficient temporal-shift approach to spatio-temporal video modeling; Non-local introduced non-local association modeling for video. Attention Cluster and StNet are Baidu's own models, published at CVPR 2018 and AAAI 2019 respectively, and were used in the first-place entry of the Kinetics-600 competition. The C-TCN action-localization model, also developed by Baidu, won the 2018 ActivityNet challenge. BSN generates proposals bottom-up, providing an efficient solution to proposal generation for temporal action localization. BMN is Baidu's own model and the winning solution of the 2019 ActivityNet challenge.
+- Covers several leading models for video classification and action localization. Attention LSTM, Attention Cluster and NeXtVLAD are popular feature-sequence models, while Non-local, TSN, TSM and StNet are end-to-end video classification models. Attention LSTM is fast and accurate; NeXtVLAD was the best single model in the 2nd YouTube-8M competition; TSN is the classic 2D-CNN-based solution; TSM is a simple and efficient temporal-shift approach to spatio-temporal video modeling; Non-local introduced non-local association modeling for video. Attention Cluster and StNet are Baidu's own models, published at CVPR 2018 and AAAI 2019 respectively, and were used in the first-place entry of the Kinetics-600 competition. The C-TCN action-localization model, also developed by Baidu, won the 2018 ActivityNet challenge. BSN generates proposals bottom-up, providing an efficient solution to proposal generation for temporal action localization. BMN is Baidu's own model and the winning solution of the 2019 ActivityNet challenge. ETS builds its network around a temporal-attention mechanism and is a classic model for generating text descriptions of videos. TALL retrieves video clips with a multi-modal temporal-regression localizer.
 - Provides common skeleton code suitable for video classification and action-localization tasks; users can efficiently configure a model and complete training and evaluation with one command.

 ## Installation

-Running the sample code in this repo requires PaddlePaddle Fluid v1.5.0 or above. If the PaddlePaddle in your environment is older, please update it following the [installation guide](http://www.paddlepaddle.org/documentation/docs/zh/1.5/beginners_guide/install/index_cn.html).
+Running the sample code in this repo requires PaddlePaddle Fluid v1.6.0 or above. If the PaddlePaddle in your environment is older, please update it following the [installation guide](http://www.paddlepaddle.org/documentation/docs/zh/1.6/beginners_guide/install/index_cn.html).

@@ -35,6 +37,10 @@
 ### Other dependencies
 - CUDNN >= 7.0
 - pandas
 - h5py
+- When using the YouTube-8M dataset, the tfrecord data must be converted into pickle format, which requires TensorFlow; see the YouTube-8M part of the [data notes](./data/dataset/README.md). This concerns Attention Cluster, Attention LSTM and NeXtVLAD; ignore it for other models.
+- When using the Kinetics dataset, ffmpeg is needed if the mp4 files are to be pre-decoded and saved in pickle format; see the Kinetics part of the [data notes](./data/dataset/README.md). Note that although Non-local also uses Kinetics, its input is the raw video files and no pre-decoding is needed, so this does not apply to it. This concerns TSN, TSM and StNet; ignore it for other models.

@@ -178,6 +184,17 @@ run.sh
 | BSN | 16 | 1x K40 | 7.0 | 66.64% (AUC) | [model-tem](https://paddlemodels.bj.bcebos.com/video_detection/BsnTem_final.pdparams), [model-pem](https://paddlemodels.bj.bcebos.com/video_detection/BsnPem_final.pdparams) |
 | BMN | 16 | 4x K40 | 7.0 | 67.19% (AUC) | [model](https://paddlemodels.bj.bcebos.com/video_detection/BMN_final.pdparams) |

+- Video captioning models on ActivityNet Captions:
+
+| Model | Batch Size | Environment | cuDNN | METEOR | Download |
+| :-------: | :---: | :---------: | :----: | :----: | :----------: |
+| ETS | 256 | 4x P40 | 7.0 | 9.8 | [model](https://paddlemodels.bj.bcebos.com/video_caption/ETS_final.pdparams) |
+
+- Video grounding models on TACoS:
+
+| Model | Batch Size | Environment | cuDNN | R1@IOU5 | R5@IOU5 | Download |
+| :-------: | :---: | :---------: | :----: | :----: | :----: | :----------: |
+| TALL | 56 | 1x P40 | 7.2 | 0.13 | 0.24 | [model](https://paddlemodels.bj.bcebos.com/video_grounding/TALL_final.pdparams) |

 ## References

@@ -190,10 +207,12 @@ run.sh
 - [Non-local Neural Networks](https://arxiv.org/abs/1711.07971v1), Xiaolong Wang, Ross Girshick, Abhinav Gupta, Kaiming He
 - [Bsn: Boundary sensitive network for temporal action proposal generation](http://arxiv.org/abs/1806.02964), Tianwei Lin, Xu Zhao, Haisheng Su, Chongjing Wang, Ming Yang.
 - [BMN: Boundary-Matching Network for Temporal Action Proposal Generation](https://arxiv.org/abs/1907.09702), Tianwei Lin, Xiao Liu, Xin Li, Errui Ding, Shilei Wen.
+- [Describing Videos by Exploiting Temporal Structure](https://arxiv.org/abs/1502.08029).
+- [TALL: Temporal Activity Localization via Language Query](https://arxiv.org/abs/1705.02101).

 ## Version updates

 - 3/2019: New model zoo; released five video classification models: Attention Cluster, Attention LSTM, NeXtVLAD, StNet and TSN.
 - 4/2019: Released two video classification models: Non-local and TSM.
 - 6/2019: Released the C-TCN action-localization model; added C2D ResNet101 and I3D ResNet50 backbones to Non-local; optimized speed and memory of NeXtVLAD and TSM.
+- 10/2019: Released the temporal action localization models BSN and BMN, the video captioning model ETS, and the video grounding model TALL.
```
PaddleCV/PaddleVideo/configs/bmn.yaml

```diff
@@ -7,7 +7,7 @@ MODEL:
     num_sample: 32
     num_sample_perbin: 3
     anno_file: "data/dataset/bmn/activitynet_1.3_annotations.json"
-    feat_path: '/paddle/PaddleProject/data/fix_feat_100'
+    feat_path: 'data/dataset/bmn/fix_feat_100'

 TRAIN:
     subset: "train"
```
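The change above only repoints `feat_path` from an absolute development path to a path inside the repo. A minimal sanity-check sketch before training, assuming PyYAML is installed and the script runs from the PaddleVideo root (the keys are taken from the diff above):

```python
import os
import yaml

# Load the BMN config and verify that the data paths it references exist.
with open('configs/bmn.yaml') as f:
    cfg = yaml.safe_load(f)

feat_path = cfg['MODEL']['feat_path']   # 'data/dataset/bmn/fix_feat_100'
anno_file = cfg['MODEL']['anno_file']

for p in (feat_path, anno_file):
    status = 'ok' if os.path.exists(p) else 'MISSING'
    print('%-50s %s' % (p, status))
```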
PaddleCV/PaddleVideo/configs/bsn_tem.yaml

```diff
@@ -8,7 +8,7 @@ MODEL:
     num_sample: 32
     num_sample_perbin: 3
     anno_file: "data/dataset/bmn/activitynet_1.3_annotations.json"
-    feat_path: '/paddle/PaddleProject/data/fix_feat_100'
+    feat_path: 'data/dataset/bmn/fix_feat_100'
     pgm_top_K_train: 500
     pgm_top_K: 1000
     pgm_threshold: 0.5
```
PaddleCV/PaddleVideo/configs/ets.yaml (new file, 0 → 100644)

```yaml
MODEL:
    name: "ETS"
    feat_size: 2048
    fc_dim: 1024
    gru_hidden_dim: 512
    decoder_size: 512
    word_emb_dim: 512
    max_length: 80
    beam_size: 3
    START: "<s>"
    END: "<e>"
    UNK: "<unk>"
    feat_path: './data/dataset/ets/data_dict'
    dict_file: './data/dataset/ets/dict.txt'

TRAIN:
    epoch: 40
    batch_size: 256
    l2_weight_decay: 1e-4
    clip_norm: 5.0
    num_threads: 8
    buffer_size: 1024
    filelist: './data/dataset/ets/train.list'
    use_gpu: True
    num_gpus: 4

VALID:
    filelist: './data/dataset/ets/val.list'
    batch_size: 256
    num_threads: 8
    buffer_size: 1024
    use_gpu: True
    num_gpus: 4

TEST:
    filelist: './data/dataset/ets/val.list'
    batch_size: 1
    num_threads: 1
    buffer_size: 1024

INFER:
    filelist: './data/dataset/ets/infer.list'
    batch_size: 1
    num_threads: 1
    buffer_size: 1024
```
PaddleCV/PaddleVideo/configs/tall.yaml (new file, 0 → 100644)

```yaml
MODEL:
    name: "TALL"
    visual_feature_dim: 12288
    sentence_embedding_size: 4800
    semantic_size: 1024
    hidden_size: 1000
    output_size: 3

TRAIN:
    epoch: 25
    use_gpu: True
    num_gpus: 1
    batch_size: 56
    off_size: 2
    clip_norm: 5.0
    learning_rate: 1e-3
    semantic_size: 1024
    feats_dimen: 4096
    context_num: 1
    context_size: 128
    sent_vec_dim: 4800
    sliding_clip_path: "./data/dataset/tall/Interval64_128_256_512_overlap0.8_c3d_fc6/"
    clip_sentvec: "./data/dataset/tall/train_clip-sentvec.pkl"
    movie_length_info: "./data/dataset/tall/video_allframes_info.pkl"

VALID:
    use_gpu: True
    num_gpus: 1
    batch_size: 56
    off_size: 2
    clip_norm: 5.0
    learning_rate: 1e-3
    semantic_size: 1024
    feats_dimen: 4096
    context_num: 1
    context_size: 128
    sent_vec_dim: 4800
    sliding_clip_path: "./data/dataset/tall/Interval64_128_256_512_overlap0.8_c3d_fc6/"
    clip_sentvec: "./data/dataset/tall/train_clip-sentvec.pkl"
    movie_length_info: "./data/dataset/tall/video_allframes_info.pkl"

TEST:
    batch_size: 1
    feats_dimen: 4096
    context_num: 1
    context_size: 128
    sent_vec_dim: 4800
    semantic_size: 4800
    sliding_clip_path: "./data/dataset/tall/Interval128_256_overlap0.8_c3d_fc6/"
    clip_sentvec: "./data/dataset/tall/test_clip-sentvec.pkl"

INFER:
    batch_size: 1
    feats_dimen: 4096
    context_num: 1
    context_size: 128
    sent_vec_dim: 4800
    semantic_size: 4800
    filelist: "./data/dataset/tall/infer"
    sliding_clip_path: "./data/dataset/tall/infer/infer_feat"
    clip_sentvec: "./data/dataset/tall/infer/infer_clip-sen.pkl"
```
PaddleCV/PaddleVideo/data/dataset/bmn/README.md

```diff
@@ -8,8 +8,4 @@ BMN uses the ActivityNet 1.3 dataset, which can be prepared in either of two ways:
 Option 2:

-We also provide the processed video features on [Baidu Netdisk](https://pan.baidu.com/s/19GI3_-uZbd_XynUO6g-8YQ) and [Google Drive](https://drive.google.com/file/d/1ISemndlSDS2FtqQOKL0t3Cjj9yk2yznF/view?usp=sharing). If you download from Baidu Netdisk, run the following command before unzipping:
-
-    cat zip_csv_mean_100.z* > csv_mean_100.zip
-
-After unzipping, update the feature path feat\_path in configs/bmn.yaml accordingly.
+We also provide the processed video features: download [bmn\_feat](https://paddlemodels.bj.bcebos.com/video_detection/bmn_feat.tar.gz) and unzip it, then update the feature path feat\_path in configs/bmn.yaml accordingly.
```
PaddleCV/PaddleVideo/data/dataset/ctcn/README.md

````diff
 # C-TCN data notes

-C-TCN uses the ActivityNet 1.3 dataset; for how to obtain it, see the official [download instructions](http://activity-net.org/index.html). Training this model requires first extracting RGB and Flow features from the mp4 source files, then extracting abstract feature data with a trained TSN model and storing it in pickle format. We will provide a download link for the converted data. The directory structure of the converted data is:
+C-TCN uses the ActivityNet 1.3 dataset; for how to obtain it, see the official [download instructions](http://activity-net.org/index.html). Training this model requires first extracting RGB and Flow features from the mp4 source files, then extracting abstract feature data with a trained TSN model and storing it in pickle format. We provide the converted data on Baidu Cloud ([download link](https://paddlemodels.bj.bcebos.com/video_detection/CTCN_data.tar.gz)). The directory structure of the converted data is:

 ```
 data
 ...
````
PaddleCV/PaddleVideo/data/dataset/ets/README.md (new file, 0 → 100644)

# ETS data notes

ETS uses the ActivityNet Captions dataset. Prepare the data as follows:

Step 1. Feature data:

- On the [ActivityNet download page](http://activity-net.org/challenges/2019/tasks/anet_captioning.html), download the "Frame-level features" dataset (~89GB). Place the downloaded resnet152\_features\_activitynet\_5fps\_320x240.pkl file under the PaddleVideo/data/dataset/ets directory;
- Run PaddleVideo/data/dataset/ets/generate\_train\_pickle.py to convert the data into per-video pickle files for easy in-memory loading. The generated data is stored in the PaddleVideo/data/dataset/ets/feat\_data folder.

Step 2. Label and index data:

- On the [Dense-Captioning Events in Videos project page](http://cs.stanford.edu/people/ranjaykrishna/densevid/), download the captions folder via the dataset link; it contains the label and index json files. Place the captions folder under the PaddleVideo/data/dataset/ets directory;
- Download the coco-caption folder following the steps in [metrics notes](../../../metrics/ets\_metrics/README.md) and place it under the PaddleVideo directory;
- Run generate\_data.py with python to produce the training text files train.list and val.list.

Step 3. Inference data:

- After the first two steps, run generate\_infer\_data.py with python to produce infer.list.

After these steps, the final layout of PaddleVideo/data/dataset/ets is:

```
ets
|
|----feat_data/
|----train.list
|----val.list
|----generate_train_pickle.py
|----generate_data.py
|----generate_infer_data.py
|----captions/
|----resnet152_features_activitynet_5fps_320x240.pkl (can be removed after feat_data is generated, to save disk space)
```
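Judging from the `'\t'.join(id.split() + sentence)` write in generate_data.py below, each line of train.list/val.list is tab-separated: video id, start time, end time, then the tokenized caption. A minimal sketch for inspecting the generated list, under that assumption:

```python
# Print the first few lines of the generated train.list.
# Assumed layout per line: vid \t start \t end \t tokenized caption
with open('train.list') as f:
    for lineno, line in enumerate(f):
        if lineno == 5:
            break
        vid, start, end, caption = line.rstrip('\n').split('\t', 3)
        print('%s [%.2f, %.2f] -> %s' % (vid, float(start), float(end), caption))
```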
PaddleCV/PaddleVideo/data/dataset/ets/generate_data.py (new file, 0 → 100644; written for Python 2)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import json
import sys

sys.path.insert(0, '../../../coco-caption')
from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer


def remove_nonascii(text):
    """ remove nonascii
    """
    return ''.join([i if ord(i) < 128 else ' ' for i in text])


def generate_dictionary(caption_file_path):
    index = 0
    input_dict = {}
    # get all sentences
    train_data = json.loads(open(os.path.join( \
        caption_file_path, 'train.json')).read())
    for vid, content in train_data.iteritems():
        sentences = content['sentences']
        for s in sentences:
            input_dict[index] = [{'caption': remove_nonascii(s)}]
            index += 1
    # ptbtokenizer
    tokenizer = PTBTokenizer()
    output_dict = tokenizer.tokenize(input_dict)
    # sort by word frequency
    word_count_dict = {}
    for _, sentence in output_dict.iteritems():
        words = sentence[0].split()
        for w in words:
            if w not in word_count_dict:
                word_count_dict[w] = 1
            else:
                word_count_dict[w] += 1
    # output dictionary
    with open('dict.txt', 'w') as f:
        f.write('<s> -1\n')
        f.write('<e> -1\n')
        f.write('<unk> -1\n')
        truncation = 3
        for word, freq in sorted(word_count_dict.iteritems(), \
                key=lambda x: x[1], reverse=True):
            if freq >= truncation:
                f.write('%s %d\n' % (word, freq))
    print 'Generate dictionary done ...'


def generate_data_list(mode, caption_file_path):
    # get file name
    if mode == 'train':
        file_name = 'train.json'
    elif mode == 'val':
        file_name = 'val_1.json'
    else:
        print 'Invalid mode: %s' % mode  # fixed: the original format string lacked the %s placeholder
        sys.exit()
    # get timestamps and sentences
    input_dict = {}
    data = json.loads(open(os.path.join( \
        caption_file_path, file_name)).read())
    for vid, content in data.iteritems():
        sentences = content['sentences']
        timestamps = content['timestamps']
        for t, s in zip(timestamps, sentences):
            dictkey = ' '.join([vid, str(t[0]), str(t[1])])
            input_dict[dictkey] = [{'caption': remove_nonascii(s)}]
    # ptbtokenizer
    tokenizer = PTBTokenizer()
    output_dict = tokenizer.tokenize(input_dict)
    with open('%s.list' % mode, 'wb') as f:
        for id, sentence in output_dict.iteritems():
            try:
                f.write('\t'.join(id.split() + sentence) + '\n')
            except:
                pass
    print 'Generate %s.list done ...' % mode


if __name__ == '__main__':
    caption_file_path = './captions/'
    generate_dictionary(caption_file_path)
    generate_data_list('train', caption_file_path)
    generate_data_list('val', caption_file_path)
```
PaddleCV/PaddleVideo/data/dataset/ets/generate_infer_data.py (new file, 0 → 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

# Take the first 100 lines of val.list as the inference file list.
f_train = open('val.list')  # fixed: the original read `if_train = open(...)` but used f_train below
f_lines = f_train.readlines()

with open('infer.list', 'wb') as f:
    for i in range(100):
        f.write(f_lines[i])
```
PaddleCV/PaddleVideo/data/dataset/ets/generate_train_pickle.py (new file, 0 → 100644; written for Python 2)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import pickle
import os
import multiprocessing

output_dir = './feat_data'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

fname = 'resnet152_features_activitynet_5fps_320x240.pkl'
d = pickle.load(open(fname))


def save_file(filenames, process_id):
    count = 0
    for key in filenames:
        pickle.dump(d[key], open(os.path.join(output_dir, key), 'w'))
        count += 1
        if count % 100 == 0:
            print('# %d processed %d samples' % (process_id, count))
    print('# %d total processed %d samples' % (process_id, count))


total_keys = d.keys()
num_threads = 8
filelists = [None] * 8
seg_nums = len(total_keys) // 8
p_list = [None] * 8

for i in range(8):
    if i == 7:
        filelists[i] = total_keys[i * seg_nums:]
    else:
        filelists[i] = total_keys[i * seg_nums:(i + 1) * seg_nums]
    p_list[i] = multiprocessing.Process(
        target=save_file, args=(filelists[i], i))
    p_list[i].start()
```
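generate_train_pickle.py splits the single ~89GB feature pickle into one small pickle per video so the reader can load features lazily. A minimal sketch of reading one of them back; run it with the same Python version used to generate the files, since the script above writes text-mode protocol-0 pickles:

```python
import os
import pickle

feat_dir = './feat_data'
key = os.listdir(feat_dir)[0]  # pick any generated per-video file
with open(os.path.join(feat_dir, key), 'rb') as f:
    feat = pickle.load(f)  # frame-level ResNet-152 features for this video
print(key, getattr(feat, 'shape', type(feat)))
```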
PaddleCV/PaddleVideo/data/dataset/nonlocal/README.md

```diff
@@ -12,11 +12,11 @@
 ## Generating the file lists

-Open generate\_list.sh and change TRAIN\_DIR and VALID\_DIR in it to the paths where your mp4 files are stored, then run the script:
+Run the command below to generate trainlist.txt, vallist.txt and testlist.txt:

-    bash generate_list.sh
+    python generate_filelist.py ${TRAIN_DIR} ${VALID_DIR}

-to generate trainlist.txt, vallist.txt and testlist.txt.
+where TRAIN\_DIR and VALID\_DIR are the directories holding the training and validation data. Please make sure [kinetics-400\_train.csv](https://github.com/activitynet/ActivityNet/tree/master/Crawler/Kinetics/data/kinetics-400_train.csv) has already been downloaded locally, otherwise generate\_filelist.py will raise an error.

 Also, to inspect the model's inference results, you can copy testlist.txt to create inferlist.txt,
```
PaddleCV/PaddleVideo/data/dataset/nonlocal/generate_filelist.py

```diff
@@ -17,13 +17,14 @@ import sys
 num_classes = 400
 replace_space_by_underliner = True  # whether to replace space by '_' in labels

-fn = sys.argv[1]  #'trainlist_download400.txt'
-train_dir = sys.argv[2]  #'/docker_mount/data/k400/Kinetics_trimmed_processed_train'
-val_dir = sys.argv[3]  #'/docker_mount/data/k400/Kinetics_trimmed_processed_val'
-trainlist = sys.argv[4]  #'trainlist.txt'
-vallist = sys.argv[5]  #'vallist.txt'
+train_dir = sys.argv[1]  #e.g., '/docker_mount/data/k400/Kinetics_trimmed_processed_train'
+val_dir = sys.argv[2]  #e.g., '/docker_mount/data/k400/Kinetics_trimmed_processed_val'
+fn = 'kinetics-400_train.csv'  # this should be downloaded first from ActivityNet
+trainlist = 'trainlist.txt'
+vallist = 'vallist.txt'
+testlist = 'testlist.txt'

 fl = open(fn).readlines()
 fl = [line.strip() for line in fl if line.strip() != '']

@@ -74,5 +75,26 @@ def generate_file(Faction_label_dict, Ftrain_dir, Ftrainlist, Fnum_classes):
     trainlist_outfile.close()

 ###### generate file list for training
 generate_file(action_label_dict, train_dir, trainlist, num_classes)
 ###### generate file list for validation
 generate_file(action_label_dict, val_dir, vallist, num_classes)
+###### generate file list for evaluation
+sampling_times = 10
+cropping_times = 3
+
+fl = open(vallist).readlines()
+fl = [line.strip() for line in fl if line.strip() != '']
+f_test = open(testlist, 'w')
+
+for i in range(len(fl)):
+    line = fl[i].split(' ')
+    fn = line[0]
+    label = line[1]
+    for j in range(sampling_times):
+        for k in range(cropping_times):
+            test_item = fn + ' ' + str(i) + ' ' + str(j) + ' ' + str(k) + '\n'
+            f_test.write(test_item)
+f_test.close()
```
PaddleCV/PaddleVideo/data/dataset/nonlocal/generate_list.sh (deleted, 100644 → 0)

```sh
# Download txt name
TRAINLIST_DOWNLOAD="kinetics-400_train.csv"
# path of the train and valid data
TRAIN_DIR=YOUR_TRAIN_DATA_DIR  # replace this with your train data dir
VALID_DIR=YOUR_VALID_DATA_DIR  # replace this with your valid data dir
python generate_filelist.py $TRAINLIST_DOWNLOAD $TRAIN_DIR $VALID_DIR trainlist.txt vallist.txt
# generate test list
python generate_testlist_multicrop.py
```
PaddleCV/PaddleVideo/data/dataset/nonlocal/generate_testlist_multicrop.py (deleted, 100644 → 0)

```python
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

vallist = 'vallist.txt'
testlist = 'testlist.txt'
sampling_times = 10
cropping_times = 3

fl = open(vallist).readlines()
fl = [line.strip() for line in fl if line.strip() != '']
f_test = open(testlist, 'w')

for i in range(len(fl)):
    line = fl[i].split(' ')
    fn = line[0]
    label = line[1]
    for j in range(sampling_times):
        for k in range(cropping_times):
            test_item = fn + ' ' + str(i) + ' ' + str(j) + ' ' + str(k) + '\n'
            f_test.write(test_item)
f_test.close()
```
PaddleCV/PaddleVideo/data/dataset/tall/README.md (new file, 0 → 100644)

# TALL data notes

TALL uses the TACoS dataset. Prepare the data as follows:

Step 1. Training and test sets:

- Training and testing use pre-extracted feature data; please follow the [data download](https://github.com/jiyanggao/TALL) instructions provided by the original TALL authors to obtain it for training and evaluating the model;

Step 2. Inference data:

- To make running inference easier, we provide ./gen\_infer.py for generating the infer data; after finishing step 1, run it with python to create the infer data in the current folder.

After these steps, PaddleVideo/data/dataset/tall must finally contain:

```
tall
|
|----Interval64_128_256_512_overlap0.8_c3d_fc6/
|----Interval128_256_overlap0.8_c3d_fc6/
|----train_clip-sentvec.pkl
|----test_clip-sentvec.pkl
|----video_allframes_info.pkl
|----infer
    |
    |----infer_feat/
    |----infer_clip-sen.pkl
```
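Judging from gen_infer.py below, infer_clip-sen.pkl is a list of `[clip_name, [sentence_vector]]` pairs for one selected movie. A minimal sketch for checking the generated infer data, under that assumption:

```python
import pickle

with open('infer/infer_clip-sen.pkl', 'rb') as f:
    clips = pickle.load(f)

# Each entry is [clip_name, [sentence_vector]], all from a single movie.
for clip_name, sent_vecs in clips[:3]:
    print(clip_name, 'num sentence vectors:', len(sent_vecs))
```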
PaddleCV/PaddleVideo/data/dataset/tall/gen_infer.py (new file, 0 → 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

# select sentence vector and featmap of one movie name for inference
import os
import sys
import pickle
import numpy as np

infer_path = 'infer'
infer_feat_path = 'infer/infer_feat'
if not os.path.exists(infer_path):
    os.mkdir(infer_path)
if not os.path.exists(infer_feat_path):
    os.mkdir(infer_feat_path)

python_ver = sys.version_info
pickle_path = 'test_clip-sentvec.pkl'
if python_ver < (3, 0):
    movies_sentence = pickle.load(open(pickle_path, 'rb'))
else:
    movies_sentence = pickle.load(open(pickle_path, 'rb'), encoding='bytes')

select_name = movies_sentence[0][0].split('.')[0]

res_sentence = []
for movie_sentence in movies_sentence:
    if movie_sentence[0].split('.')[0] == select_name:
        res_sen = []
        res_sen.append(movie_sentence[0])
        res_sen.append([movie_sentence[1][0]])  #select the first one sentence
        res_sentence.append(res_sen)

file = open('infer/infer_clip-sen.pkl', 'wb')
pickle.dump(res_sentence, file, protocol=2)

movies_feat = os.listdir('Interval128_256_overlap0.8_c3d_fc6')
for movie_feat in movies_feat:
    if movie_feat.split('.')[0] == select_name:
        feat_path = os.path.join('Interval128_256_overlap0.8_c3d_fc6',
                                 movie_feat)
        feat = np.load(feat_path)
        np.save(os.path.join(infer_feat_path, movie_feat), feat)
```
PaddleCV/PaddleVideo/eval.py

```diff
@@ -26,6 +26,7 @@ import models
 from reader import get_reader
 from metrics import get_metrics
 from utils.utility import check_cuda
+from utils.utility import check_version

 logging.root.handlers = []
 FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'

@@ -83,7 +84,7 @@ def test(args):
     # build model
     test_model = models.get_model(args.model_name, test_config, mode='test')
-    test_model.build_input(use_pyreader=False)
+    test_model.build_input(use_dataloader=False)
     test_model.build_model()
     test_feeds = test_model.feeds()
     test_fetch_list = test_model.fetches()

@@ -91,6 +92,8 @@
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
+    exe.run(fluid.default_startup_program())

     if args.weights:
         assert os.path.exists(
             args.weights), "Given weight dir {} not exist.".format(args.weights)

@@ -110,8 +113,23 @@
     epoch_period = []
     for test_iter, data in enumerate(test_reader()):
         cur_time = time.time()
-        test_outs = exe.run(fetch_list=test_fetch_list,
-                            feed=test_feeder.feed(data))
+        if args.model_name == 'ETS':
+            feat_data = [items[:3] for items in data]
+            vinfo = [items[3:] for items in data]
+            test_outs = exe.run(fetch_list=test_fetch_list,
+                                feed=test_feeder.feed(feat_data),
+                                return_numpy=False)
+            test_outs += [vinfo]
+        elif args.model_name == 'TALL':
+            feat_data = [items[:2] for items in data]
+            vinfo = [items[2:] for items in data]
+            test_outs = exe.run(fetch_list=test_fetch_list,
+                                feed=test_feeder.feed(feat_data),
+                                return_numpy=True)
+            test_outs += [vinfo]
+        else:
+            test_outs = exe.run(fetch_list=test_fetch_list,
+                                feed=test_feeder.feed(data))
         period = time.time() - cur_time
         epoch_period.append(period)
         test_metrics.accumulate(test_outs)

@@ -130,6 +148,7 @@ if __name__ == "__main__":
     args = parse_args()
     # check whether the installed paddle is compiled with GPU
     check_cuda(args.use_gpu)
+    check_version()
     logger.info(args)
     test(args)
```
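The new ETS/TALL branches in test() feed only the first few elements of each sample to the network and route the rest (video id, timestamps, clip names) straight to the metrics via test_outs. A standalone sketch of that split, with made-up sample tuples:

```python
# Hypothetical ETS-style samples: (feat, word_ids, label, vid, stime, etime)
data = [('feat0', 'ids0', 'lbl0', 'v_001', 0.0, 3.5),
        ('feat1', 'ids1', 'lbl1', 'v_002', 1.2, 7.8)]

feat_data = [items[:3] for items in data]  # goes through the feeder into the network
vinfo = [items[3:] for items in data]      # appended to test_outs for the metrics

print(feat_data)  # [('feat0', 'ids0', 'lbl0'), ('feat1', 'ids1', 'lbl1')]
print(vinfo)      # [('v_001', 0.0, 3.5), ('v_002', 1.2, 7.8)]
```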
PaddleCV/PaddleVideo/inference_model.py

```diff
@@ -80,7 +80,7 @@ def save_inference_model(args):
     infer_config = merge_configs(config, 'infer', vars(args))
     print_configs(infer_config, "Infer")
     infer_model = models.get_model(args.model_name, infer_config, mode='infer')
-    infer_model.build_input(use_pyreader=False)
+    infer_model.build_input(use_dataloader=False)
     infer_model.build_model()
     infer_feeds = infer_model.feeds()
     infer_outputs = infer_model.outputs()
```
PaddleCV/PaddleVideo/metrics/bmn_metrics/README.md

```diff
@@ -3,7 +3,9 @@
 - For details on using the ActivityNet dataset, see its [official website](http://activity-net.org)
-- To obtain the metric-evaluation code, download it from the [ActivityNet GitHub repository](https://github.com/activitynet/ActivityNet.git) and copy the Evaluation folder into the PaddleVideo directory.
+- To obtain the metric-evaluation code, download it from the [ActivityNet GitHub repository](https://github.com/activitynet/ActivityNet.git) and copy the Evaluation folder into the PaddleVideo directory. (Note: under python3 the print function needs parentheses; adjust the .py files in the Evaluation directory accordingly.)
+- Please download [activity\_net\_1\_3\_new.json](https://paddlemodels.bj.bcebos.com/video_detection/activity_net_1_3_new.json) and place it under PaddleVideo/Evaluation/data. Compared with the original activity\_net.v1-3.min.json file, it filters out some video entries that are no longer available.
 - Compute the metrics
```
PaddleCV/PaddleVideo/metrics/bmn_metrics/bmn_proposal_metrics.py

```diff
@@ -97,7 +97,7 @@ class MetricsCalculator():
             os.path.join(self.output_path, "%s.csv" % video_name), index=False)

     def accumulate(self, fetch_list):
-        cur_batch_size = 1  # iteration counter
+        cur_batch_size = 1  # iteration counter, for test and inference, batch_size=1
         total_loss = fetch_list[0]
         tem_loss = fetch_list[1]
         pem_reg_loss = fetch_list[2]
```
PaddleCV/PaddleVideo/metrics/bsn_metrics/bsn_pem_metrics.py

```diff
@@ -61,13 +61,15 @@ class MetricsCalculator():
             os.makedirs(self.output_path_pem)

     def save_results(self, pred_iou, props_info, fid):
-        video_name = self.video_list[fid]
+        if self.mode == 'infer':
+            video_name = self.video_list[fid[0]]
+        else:
+            video_name = self.video_list[fid[0][0]]
         df = pd.DataFrame()
-        df["xmin"] = props_info[0, 0, :]
-        df["xmax"] = props_info[0, 1, :]
-        df["xmin_score"] = props_info[0, 2, :]
-        df["xmax_score"] = props_info[0, 3, :]
+        df["xmin"] = props_info[0, :, 0]
+        df["xmax"] = props_info[0, :, 1]
+        df["xmin_score"] = props_info[0, :, 2]
+        df["xmax_score"] = props_info[0, :, 3]
         df["iou_score"] = pred_iou.squeeze()
         df.to_csv(
             os.path.join(self.output_path_pem, video_name + ".csv"),

@@ -83,13 +85,13 @@
         if self.mode == 'test':
             pred_iou = np.array(fetch_list[1])
             props_info = np.array(fetch_list[2])
-            fid = fetch_list[3][0][0]
+            fid = np.array(fetch_list[3])
             self.save_results(pred_iou, props_info, fid)

     def accumulate_infer_results(self, fetch_list):
         pred_iou = np.array(fetch_list[0])
-        props_info = np.array(fetch_list[1])
-        fid = fetch_list[2][0]
+        props_info = np.array([item[0] for item in fetch_list[1]])
+        fid = [item[1] for item in fetch_list[1]]
         self.save_results(pred_iou, props_info, fid)

     def finalize_metrics(self):
```
PaddleCV/PaddleVideo/metrics/bsn_metrics/bsn_tem_metrics.py

```diff
@@ -39,7 +39,6 @@ class MetricsCalculator():
             1.0 / self.tscale * i for i in range(1, self.tscale + 1)
         ]
         if self.mode == "test" or self.mode == "infer":
-            print('1212')
             self.output_path_tem = cfg[self.mode.upper()]["output_path_tem"]
             self.output_path_pgm_feature = cfg[self.mode.upper()]["output_path_pgm_feature"]

@@ -101,15 +100,21 @@
             os.makedirs(self.output_path_pgm_proposal)

     def save_results(self, pred_tem, fid):
-        video_name = self.video_list[fid]
-        pred_start = pred_tem[0, 0, :]
-        pred_end = pred_tem[0, 1, :]
-        pred_action = pred_tem[0, 2, :]
-        output_tem = np.stack([pred_start, pred_end, pred_action], axis=1)
-        video_df = pd.DataFrame(output_tem, columns=["start", "end", "action"])
-        video_df.to_csv(
-            os.path.join(self.output_path_tem, video_name + ".csv"),
-            index=False)
+        batch_size = pred_tem.shape[0]
+        for i in range(batch_size):
+            if self.mode == 'test':
+                video_name = self.video_list[fid[i][0]]
+            elif self.mode == 'infer':
+                video_name = self.video_list[fid[i]]
+            pred_start = pred_tem[i, 0, :]
+            pred_end = pred_tem[i, 1, :]
+            pred_action = pred_tem[i, 2, :]
+            output_tem = np.stack([pred_start, pred_end, pred_action], axis=1)
+            video_df = pd.DataFrame(output_tem, columns=["start", "end", "action"])
+            video_df.to_csv(
+                os.path.join(self.output_path_tem, video_name + ".csv"),
+                index=False)

     def accumulate(self, fetch_list):
         cur_batch_size = 1  # iteration counter

@@ -126,12 +131,12 @@
         if self.mode == 'test':
             pred_tem = np.array(fetch_list[4])
-            fid = fetch_list[5][0][0]
+            fid = fetch_list[5]
             self.save_results(pred_tem, fid)

     def accumulate_infer_results(self, fetch_list):
         pred_tem = np.array(fetch_list[0])
-        fid = fetch_list[1][0]
+        fid = fetch_list[1]
         self.save_results(pred_tem, fid)

     def finalize_metrics(self):
```
PaddleCV/PaddleVideo/metrics/detections/detection_metrics.py

```diff
@@ -128,7 +128,6 @@ class MetricsCalculator():
     def accumulate_infer_results(self, fetch_list):
         fname = fetch_list[2][0]
-        print('fname', fetch_list[2])
         loc_pred = np.array(fetch_list[0])
         cls_pred = np.array(fetch_list[1])
         assert len(loc_pred) == 1, "please set batchsize to be 1 when infer"
```
PaddleCV/PaddleVideo/metrics/ets_metrics/README.md (new file, 0 → 100644)

## ActivityNet Captions metrics

- For the ActivityNet Captions evaluation code, see the [official repository](https://github.com/ranjaykrishna/densevid_eval)
- Download the evaluation code and copy coco-caption and evaluate.py into the PaddleVideo directory;
- To compute the metrics, run evaluate.py with python; use the -s argument to point to the results file and the -r argument to change the label file;
- Since the model's scores fluctuate noticeably, checkpoints from different epochs can be evaluated; the best METEOR value is around 10.0.
PaddleCV/PaddleVideo/metrics/ets_metrics/__init__.py (new empty file, 0 → 100644)
PaddleCV/PaddleVideo/metrics/ets_metrics/ets_metrics.py (new file, 0 → 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
import numpy as np
import datetime
import logging
import json
import os

from models.ctcn.ctcn_utils import BoxCoder

logger = logging.getLogger(__name__)


class MetricsCalculator():
    def __init__(self, name='ETS', mode='train', dict_file=''):
        self.name = name
        self.mode = mode  # 'train', 'valid', 'test', 'infer'
        self.dict_file = dict_file
        self.reset()

    def reset(self):
        logger.info('Resetting {} metrics...'.format(self.mode))
        self.aggr_batch_size = 0
        if (self.mode == 'train') or (self.mode == 'valid'):
            self.aggr_loss = 0.0
        elif (self.mode == 'test') or (self.mode == 'infer'):
            self.result_dict = dict()
            self.out_file = self.name + '_' + self.mode + '_res_' + '.json'

    def accumulate(self, fetch_list):
        if self.mode == 'valid':
            loss = fetch_list[0]
            self.aggr_loss += np.mean(np.array(loss))
        elif (self.mode == 'test') or (self.mode == 'infer'):
            seq_ids = fetch_list[0]
            seq_scores = fetch_list[1]
            b_vid = [item[0] for item in fetch_list[2]]
            b_stime = [item[1] for item in fetch_list[2]]
            b_etime = [item[2] for item in fetch_list[2]]
            # for test and inference, batch size=1
            vid = b_vid[0]
            stime = b_stime[0]
            etime = b_etime[0]

            #get idx_to_word
            self.idx_to_word = dict()
            with open(self.dict_file, 'r') as f:
                for i, line in enumerate(f):
                    self.idx_to_word[i] = line.strip().split()[0]

            for i in range(len(seq_ids.lod()[0]) - 1):
                start = seq_ids.lod()[0][i]
                end = seq_ids.lod()[0][i + 1]
                for j in range(end - start)[:1]:
                    sub_start = seq_ids.lod()[1][start + j]
                    sub_end = seq_ids.lod()[1][start + j + 1]
                    sent = " ".join([
                        self.idx_to_word[idx]
                        for idx in np.array(seq_ids)[sub_start:sub_end][1:-1]
                    ])
                    if vid not in self.result_dict:
                        self.result_dict[vid] = [{
                            'timestamp': [stime, etime],
                            'sentence': sent
                        }]
                    else:
                        self.result_dict[vid].append({
                            'timestamp': [stime, etime],
                            'sentence': sent
                        })

    def accumulate_infer_results(self, fetch_list):
        # the same as test
        pass

    def finalize_metrics(self, savedir):
        self.filepath = os.path.join(savedir, self.out_file)
        with open(self.filepath, 'w') as f:
            f.write(
                json.dumps(
                    {
                        'version': 'VERSION 1.0',
                        'results': self.result_dict,
                        'external_data': {}
                    },
                    indent=2))
        logger.info('results has been saved into file: {}'.format(
            self.filepath))

    def finalize_infer_metrics(self, savedir):
        # the same as test
        pass

    def get_computed_metrics(self):
        pass
```
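The decoding loop above walks a two-level LoD (level-of-detail) structure from beam search: level 0 slices the buffer into source sentences, level 1 slices each source's beam candidates into token ranges, and the code keeps only the first candidate and strips the `<s>`/`<e>` markers. A plain-Python sketch of the same indexing, with a made-up LoD and id buffer (the vocabulary and ids are hypothetical):

```python
import numpy as np

# Hypothetical beam-search output: 2 sources, 2 candidates each.
# lod[0] maps sources to candidate ranges; lod[1] maps candidates to token ranges.
lod = [[0, 2, 4], [0, 4, 7, 11, 14]]
ids = np.array([1, 10, 11, 2,  1, 10, 2,  1, 20, 21, 2,  1, 20, 2])  # 1 = <s>, 2 = <e>

idx_to_word = {10: 'a', 11: 'cat', 20: 'a', 21: 'dog'}

for i in range(len(lod[0]) - 1):
    start, end = lod[0][i], lod[0][i + 1]
    for j in range(end - start)[:1]:  # keep the top beam candidate only
        sub_start, sub_end = lod[1][start + j], lod[1][start + j + 1]
        tokens = ids[sub_start:sub_end][1:-1]  # strip <s> and <e>
        print('source %d: %s' % (i, ' '.join(idx_to_word[t] for t in tokens)))
        # source 0: a cat
        # source 1: a dog
```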
PaddleCV/PaddleVideo/metrics/metrics_util.py

```diff
@@ -28,6 +28,8 @@ from metrics.detections import detection_metrics as detection_metrics
 from metrics.bmn_metrics import bmn_proposal_metrics as bmn_proposal_metrics
 from metrics.bsn_metrics import bsn_tem_metrics as bsn_tem_metrics
 from metrics.bsn_metrics import bsn_pem_metrics as bsn_pem_metrics
+from metrics.ets_metrics import ets_metrics as ets_metrics
+from metrics.tall_metrics import tall_metrics as tall_metrics

 logger = logging.getLogger(__name__)

@@ -421,6 +423,85 @@ class BsnPemMetrics(Metrics):
         self.calculator.reset()


+class ETSMetrics(Metrics):
+    def __init__(self, name, mode, cfg):
+        self.name = name
+        self.mode = mode
+        args = {}
+        args['dict_file'] = cfg.MODEL.dict_file
+        args['mode'] = mode
+        args['name'] = name
+        self.calculator = ets_metrics.MetricsCalculator(**args)
+
+    def calculate_and_log_out(self, fetch_list, info=''):
+        if (self.mode == 'train') or (self.mode == 'valid'):
+            loss = np.array(fetch_list[0])
+            logger.info(info + '\tLoss = {}'.format('%.08f' % np.mean(loss)))
+        elif self.mode == "test":
+            translation_ids = np.array(fetch_list[0])
+            translation_scores = np.array(fetch_list[1])
+            logger.info(
+                info + '\ttranslation_ids = {}, \ttranslation_scores = {}'.
+                format('%.01f' % np.mean(translation_ids),
+                       '%.04f' % np.mean(translation_scores)))
+
+    def accumulate(self, fetch_list):
+        self.calculator.accumulate(fetch_list)
+
+    def finalize_and_log_out(self, info='', savedir='./'):
+        if self.mode == 'valid':
+            logger.info(info)
+        else:  #test or infer
+            self.calculator.finalize_metrics(savedir)
+            if self.mode == 'test':
+                logger.info(
+                    info +
+                    'please refer to metrics/ets_metrics/README.md to get accuracy'
+                )
+
+    def reset(self):
+        self.calculator.reset()
+
+
+class TALLMetrics(Metrics):
+    def __init__(self, name, mode, cfg):
+        self.name = name
+        self.mode = mode
+        args = {}
+        args['mode'] = mode
+        args['name'] = name
+        self.calculator = tall_metrics.MetricsCalculator(**args)
+
+    def calculate_and_log_out(self, fetch_list, info=''):
+        if (self.mode == 'train') or (self.mode == 'valid'):
+            loss = np.array(fetch_list[0])
+            logger.info(info + '\tLoss = {}'.format('%.04f' % np.mean(loss)))
+        elif self.mode == "test":
+            sim_score_mat = np.array(fetch_list[0])
+            logger.info(info + '\tsim_score_mat = {}'.format(
+                '%.01f' % np.mean(sim_score_mat)))
+
+    def accumulate(self, fetch_list):
+        self.calculator.accumulate(fetch_list)
+
+    def finalize_and_log_out(self, info='', savedir='./'):
+        if self.mode == 'valid':
+            logger.info(info)
+        elif self.mode == 'infer':
+            self.calculator.finalize_infer_metrics(savedir)
+        else:
+            self.calculator.finalize_metrics(savedir)
+            metrics_dict = self.calculator.get_computed_metrics()
+            R1_IOU5 = metrics_dict['best_R1_IOU5']
+            R5_IOU5 = metrics_dict['best_R5_IOU5']
+            logger.info("best_R1_IOU5: {}\n".format(" %0.3f" % R1_IOU5))
+            logger.info("best_R5_IOU5: {}\n".format(" %0.3f" % R5_IOU5))
+
+    def reset(self):
+        self.calculator.reset()
+
+
 class MetricsZoo(object):
     def __init__(self):
         self.metrics_zoo = {}

@@ -461,3 +542,5 @@
 regist_metrics("CTCN", DetectionMetrics)
 regist_metrics("BMN", BmnMetrics)
 regist_metrics("BSNTEM", BsnTemMetrics)
 regist_metrics("BSNPEM", BsnPemMetrics)
+regist_metrics("ETS", ETSMetrics)
+regist_metrics("TALL", TALLMetrics)
```
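The ETS/TALL registrations above rely on the repo's MetricsZoo registry: a metrics class is registered under a model name and instantiated by name at runtime (eval.py calls get_metrics the same way). A self-contained sketch of the pattern, with names simplified; this is not the repo's exact code:

```python
# Simplified registry pattern, mirroring regist_metrics / get_metrics in spirit.
class MetricsZoo(object):
    def __init__(self):
        self.metrics_zoo = {}

    def regist(self, name, metrics_class):
        self.metrics_zoo[name] = metrics_class

    def get(self, name, mode, cfg):
        return self.metrics_zoo[name](name, mode, cfg)


class DummyMetrics(object):
    def __init__(self, name, mode, cfg):
        self.name, self.mode, self.cfg = name, mode, cfg


zoo = MetricsZoo()
zoo.regist("ETS", DummyMetrics)     # analogous to regist_metrics("ETS", ETSMetrics)
m = zoo.get("ETS", "test", cfg={})  # analogous to get_metrics(...) in eval.py
print(type(m).__name__, m.mode)     # DummyMetrics test
```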
PaddleCV/PaddleVideo/metrics/tall_metrics/__init__.py (new empty file, 0 → 100644)
PaddleCV/PaddleVideo/metrics/tall_metrics/tall_metrics.py (new file, 0 → 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
import numpy as np
import datetime
import logging
import json
import os
import operator

logger = logging.getLogger(__name__)


class MetricsCalculator():
    def __init__(self, name='TALL', mode='train'):
        self.name = name
        self.mode = mode  # 'train', 'valid', 'test', 'infer'
        self.reset()

    def reset(self):
        logger.info('Resetting {} metrics...'.format(self.mode))
        if (self.mode == 'train') or (self.mode == 'valid'):
            self.aggr_loss = 0.0
        elif (self.mode == 'test') or (self.mode == 'infer'):
            self.result_dict = dict()
            self.save_res = dict()
            self.out_file = self.name + '_' + self.mode + '_res_' + '.json'

    def nms_temporal(self, x1, x2, sim, overlap):
        pick = []
        assert len(x1) == len(sim)
        assert len(x2) == len(sim)
        if len(x1) == 0:
            return pick

        union = list(map(operator.sub, x2, x1))  # union = x2 - x1
        I = [i[0] for i in sorted(enumerate(sim), key=lambda x: x[1])]  # sort and get index
        while len(I) > 0:
            i = I[-1]
            pick.append(i)
            xx1 = [max(x1[i], x1[j]) for j in I[:-1]]
            xx2 = [min(x2[i], x2[j]) for j in I[:-1]]
            inter = [max(0.0, k2 - k1) for k1, k2 in zip(xx1, xx2)]
            o = [
                inter[u] / (union[i] + union[I[u]] - inter[u])
                for u in range(len(I) - 1)
            ]
            I_new = []
            for j in range(len(o)):
                if o[j] <= overlap:
                    I_new.append(I[j])
            I = I_new
        return pick

    def calculate_IoU(self, i0, i1):
        # calculate temporal intersection over union
        union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
        inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
        iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0])
        return iou

    def compute_IoU_recall_top_n_forreg(self, top_n, iou_thresh,
                                        sentence_image_mat,
                                        sentence_image_reg_mat, sclips):
        correct_num = 0.0
        for k in range(sentence_image_mat.shape[0]):
            gt = sclips[k]
            gt_start = float(gt.split("_")[1])
            gt_end = float(gt.split("_")[2])
            sim_v = [v for v in sentence_image_mat[k]]
            starts = [s for s in sentence_image_reg_mat[k, :, 0]]
            ends = [e for e in sentence_image_reg_mat[k, :, 1]]
            picks = self.nms_temporal(starts, ends, sim_v, iou_thresh - 0.05)
            if top_n < len(picks):
                picks = picks[0:top_n]
            for index in picks:
                pred_start = sentence_image_reg_mat[k, index, 0]
                pred_end = sentence_image_reg_mat[k, index, 1]
                iou = self.calculate_IoU((gt_start, gt_end),
                                         (pred_start, pred_end))
                if iou >= iou_thresh:
                    correct_num += 1
                    break
        return correct_num

    def accumulate(self, fetch_list):
        if self.mode == 'valid':
            loss = fetch_list[0]
            self.aggr_loss += np.mean(np.array(loss))
        elif (self.mode == 'test') or (self.mode == 'infer'):
            outputs = fetch_list[0]
            b_start = [item[0] for item in fetch_list[1]]
            b_end = [item[1] for item in fetch_list[1]]
            b_k = [item[2] for item in fetch_list[1]]
            b_t = [item[3] for item in fetch_list[1]]
            b_movie_clip_sentences = [item[4] for item in fetch_list[1]]
            b_movie_clip_featmaps = [item[5] for item in fetch_list[1]]
            b_movie_name = [item[6] for item in fetch_list[1]]

            batch_size = len(b_start)
            for i in range(batch_size):
                start = b_start[i]
                end = b_end[i]
                k = b_k[i]
                t = b_t[i]
                movie_clip_sentences = b_movie_clip_sentences[i]
                movie_clip_featmaps = b_movie_clip_featmaps[i]
                movie_name = b_movie_name[i]

                item_res = [outputs, start, end, k, t]
                if movie_name not in self.result_dict.keys():
                    self.result_dict[movie_name] = []
                    self.result_dict[movie_name].append(movie_clip_sentences)
                    self.result_dict[movie_name].append(movie_clip_featmaps)
                self.result_dict[movie_name].append(item_res)

    def accumulate_infer_results(self, fetch_list):
        # the same as test
        pass

    def finalize_metrics(self, savedir):
        # init
        IoU_thresh = [0.1, 0.3, 0.5, 0.7]
        all_correct_num_10 = [0.0] * 5
        all_correct_num_5 = [0.0] * 5
        all_correct_num_1 = [0.0] * 5
        all_retrievd = 0.0
        idx = 0
        all_number = len(self.result_dict)
        for movie_name in self.result_dict.keys():
            idx += 1
            logger.info('{} / {}'.format('%d' % idx, '%d' % all_number))
            movie_clip_sentences = self.result_dict[movie_name][0]
            movie_clip_featmaps = self.result_dict[movie_name][1]
            ls = len(movie_clip_sentences)
            lf = len(movie_clip_featmaps)
            sentence_image_mat = np.zeros([ls, lf])
            sentence_image_reg_mat = np.zeros([ls, lf, 2])
            movie_res = self.result_dict[movie_name][2:]
            for item_res in movie_res:
                outputs, start, end, k, t = item_res
                outputs = np.squeeze(outputs)
                sentence_image_mat[k, t] = outputs[0]
                reg_end = end + outputs[2]
                reg_start = start + outputs[1]
                sentence_image_reg_mat[k, t, 0] = reg_start
                sentence_image_reg_mat[k, t, 1] = reg_end

            sclips = [b[0] for b in movie_clip_sentences]
            for i in range(len(IoU_thresh)):
                IoU = IoU_thresh[i]
                correct_num_10 = self.compute_IoU_recall_top_n_forreg(
                    10, IoU, sentence_image_mat, sentence_image_reg_mat,
                    sclips)
                correct_num_5 = self.compute_IoU_recall_top_n_forreg(
                    5, IoU, sentence_image_mat, sentence_image_reg_mat, sclips)
                correct_num_1 = self.compute_IoU_recall_top_n_forreg(
                    1, IoU, sentence_image_mat, sentence_image_reg_mat, sclips)
                logger.info(
                    movie_name +
                    " IoU= {}, R@10: {}; IoU= {}, R@5: {}; IoU= {}, R@1: {}".
                    format('%s' % str(IoU),
                           '%s' % str(correct_num_10 / len(sclips)),
                           '%s' % str(IoU),
                           '%s' % str(correct_num_5 / len(sclips)),
                           '%s' % str(IoU),
                           '%s' % str(correct_num_1 / len(sclips))))
                all_correct_num_10[i] += correct_num_10
                all_correct_num_5[i] += correct_num_5
                all_correct_num_1[i] += correct_num_1
            all_retrievd += len(sclips)
        for j in range(len(IoU_thresh)):
            logger.info(
                " IoU= {}, R@10: {}; IoU= {}, R@5: {}; IoU= {}, R@1: {}".
                format('%s' % str(IoU_thresh[j]),
                       '%s' % str(all_correct_num_10[j] / all_retrievd),
                       '%s' % str(IoU_thresh[j]),
                       '%s' % str(all_correct_num_5[j] / all_retrievd),
                       '%s' % str(IoU_thresh[j]),
                       '%s' % str(all_correct_num_1[j] / all_retrievd)))

        self.R1_IOU5 = all_correct_num_1[2] / all_retrievd
        self.R5_IOU5 = all_correct_num_5[2] / all_retrievd
        self.save_res["best_R1_IOU5"] = self.R1_IOU5
        self.save_res["best_R5_IOU5"] = self.R5_IOU5
        self.filepath = os.path.join(savedir, self.out_file)
        with open(self.filepath, 'w') as f:
            f.write(
                json.dumps(
                    {
                        'version': 'VERSION 1.0',
                        'results': self.save_res,
                        'external_data': {}
                    },
                    indent=2))
        logger.info('results has been saved into file: {}'.format(
            self.filepath))

    def finalize_infer_metrics(self, savedir):
        idx = 0
        all_number = len(self.result_dict)
        res = dict()
        for movie_name in self.result_dict.keys():
            res[movie_name] = []
            idx += 1
            logger.info('{} / {}'.format('%d' % idx, '%d' % all_number))
            movie_clip_sentences = self.result_dict[movie_name][0]
            movie_clip_featmaps = self.result_dict[movie_name][1]
            ls = len(movie_clip_sentences)
            lf = len(movie_clip_featmaps)
            sentence_image_mat = np.zeros([ls, lf])
            sentence_image_reg_mat = np.zeros([ls, lf, 2])
            movie_res = self.result_dict[movie_name][2:]
            for item_res in movie_res:
                outputs, start, end, k, t = item_res
                outputs = np.squeeze(outputs)
                sentence_image_mat[k, t] = outputs[0]
                reg_end = end + outputs[2]
                reg_start = start + outputs[1]
                sentence_image_reg_mat[k, t, 0] = reg_start
                sentence_image_reg_mat[k, t, 1] = reg_end

            sclips = [b[0] for b in movie_clip_sentences]
            IoU = 0.5  #pre-define
            for k in range(sentence_image_mat.shape[0]):
                #ground_truth for compare
                gt = sclips[k]
                gt_start = float(gt.split("_")[1])
                gt_end = float(gt.split("_")[2])
                sim_v = [v for v in sentence_image_mat[k]]
                starts = [s for s in sentence_image_reg_mat[k, :, 0]]
                ends = [e for e in sentence_image_reg_mat[k, :, 1]]
                picks = self.nms_temporal(starts, ends, sim_v, IoU - 0.05)
                if 1 < len(picks):  #top1
                    picks = picks[0:1]
                for index in picks:
                    pred_start = sentence_image_reg_mat[k, index, 0]
                    pred_end = sentence_image_reg_mat[k, index, 1]
                    res[movie_name].append((k, pred_start, pred_end))
                    logger.info(
                        'movie_name: {}, sentence_id: {}, pred_start_time: {}, pred_end_time: {}, gt_start_time: {}, gt_end_time: {}'.
                        format('%s' % movie_name, '%s' % str(k),
                               '%s' % str(pred_start), '%s' % str(pred_end),
                               '%s' % str(gt_start), '%s' % str(gt_end)))

        self.filepath = os.path.join(savedir, self.out_file)
        with open(self.filepath, 'w') as f:
            f.write(
                json.dumps(
                    {
                        'version': 'VERSION 1.0',
                        'results': res,
                        'external_data': {}
                    },
                    indent=2))
        logger.info('results has been saved into file: {}'.format(
            self.filepath))

    def get_computed_metrics(self):
        return self.save_res
```
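calculate_IoU above treats clips as (start, end) pairs: IoU = (min(e0, e1) − max(s0, s1)) / (max(e0, e1) − min(s0, s1)). A quick worked check with concrete numbers, using the same arithmetic standalone:

```python
def temporal_iou(i0, i1):
    # same arithmetic as MetricsCalculator.calculate_IoU
    union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
    inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
    return 1.0 * (inter[1] - inter[0]) / (union[1] - union[0])

print(temporal_iou((10.0, 20.0), (15.0, 25.0)))  # overlap 5 / span 15 = 0.333...
print(temporal_iou((10.0, 20.0), (30.0, 40.0)))  # disjoint pair: negative value
```

Note that the code never clamps disjoint pairs to 0; they simply come out negative and fall below any IoU threshold, which is enough for the recall computation.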
PaddleCV/PaddleVideo/models/__init__.py

```diff
@@ -10,6 +10,8 @@ from .ctcn import CTCN
 from .bmn import BMN
 from .bsn import BsnTem
 from .bsn import BsnPem
+from .ets import ETS
+from .tall import TALL

 # regist models, sort by alphabet
 regist_model("AttentionCluster", AttentionCluster)

@@ -23,3 +25,5 @@
 regist_model("BMN", BMN)
 regist_model("BsnTem", BsnTem)
 regist_model("BsnPem", BsnPem)
+regist_model("ETS", ETS)
+regist_model("TALL", TALL)
```
PaddleCV/PaddleVideo/models/attention_cluster/attention_cluster.py

```diff
@@ -41,21 +41,22 @@ class AttentionCluster(ModelBase):
         self.learning_rate = self.get_config_from_sec('train', 'learning_rate',
                                                       1e-3)

-    def build_input(self, use_pyreader=True):
+    def build_input(self, use_dataloader=True):
         self.feature_input = []
         for name, dim in zip(self.feature_names, self.feature_dims):
             self.feature_input.append(
-                fluid.layers.data(
-                    shape=[self.seg_num, dim], dtype='float32', name=name))
+                fluid.data(
+                    shape=[None, self.seg_num, dim], dtype='float32', name=name))
         if self.mode != 'infer':
-            self.label_input = fluid.layers.data(
-                shape=[self.class_num], dtype='float32', name='label')
+            self.label_input = fluid.data(
+                shape=[None, self.class_num], dtype='float32', name='label')
         else:
             self.label_input = None
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=self.feature_input + [self.label_input],
                 capacity=8,
                 iterable=True)
```
PaddleCV/PaddleVideo/models/attention_lstm/attention_lstm.py

```diff
@@ -52,21 +52,21 @@ class AttentionLSTM(ModelBase):
         self.decay_gamma = self.get_config_from_sec('train', 'decay_gamma', 0.1)

-    def build_input(self, use_pyreader):
+    def build_input(self, use_dataloader):
         self.feature_input = []
         for name, dim in zip(self.feature_names, self.feature_dims):
             self.feature_input.append(
-                fluid.layers.data(
-                    shape=[dim], lod_level=1, dtype='float32', name=name))
+                fluid.data(
+                    shape=[None, dim], lod_level=1, dtype='float32', name=name))
         if self.mode != 'infer':
-            self.label_input = fluid.layers.data(
-                shape=[self.num_classes], dtype='float32', name='label')
+            self.label_input = fluid.data(
+                shape=[None, self.num_classes], dtype='float32', name='label')
         else:
             self.label_input = None
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=self.feature_input + [self.label_input],
                 capacity=8,
                 iterable=True)
```
PaddleCV/PaddleVideo/models/bmn/README.md
...
@@ -75,6 +75,7 @@ BMN is trained on the dataset provided by ActivityNet1.3; for data download and preparation see
 - When evaluating on CPU, set `use_gpu` to False in the command line above or in the run.sh script.
+- Note: the loss may come out as NaN during evaluation. Evaluation runs on single samples, and a sample may contain no instance with IoU > 0.6, which yields NaN; this has no effect on the final evaluation results (illustrated in the sketch below).

 Evaluation accuracy on the ActivityNet1.3 dataset:
...
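A one-line illustration of why an empty positive set produces NaN rather than an error — a sketch of the mechanism, not the model's actual loss code: averaging over zero selected elements yields NaN in numpy.

    import numpy as np

    iou = np.array([0.2, 0.4, 0.55])           # no instance exceeds the 0.6 threshold
    pos_loss = np.zeros_like(iou)[iou > 0.6]   # empty selection
    print(np.mean(pos_loss))                   # -> nan (with a RuntimeWarning)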
PaddleCV/PaddleVideo/models/bmn/bmn.py
...
@@ -52,47 +52,46 @@ class BMN(ModelBase):
             'l2_weight_decay')
         self.lr_decay_iter = self.get_config_from_sec('train', 'lr_decay_iter')

-    def build_input(self, use_pyreader=True):
-        feat_shape = [self.feat_dim, self.tscale]
-        gt_iou_map_shape = [self.dscale, self.tscale]
-        gt_start_shape = [self.tscale]
-        gt_end_shape = [self.tscale]
-        fileid_shape = [1]
-        self.use_pyreader = use_pyreader
+    def build_input(self, use_dataloader=True):
+        feat_shape = [None, self.feat_dim, self.tscale]
+        gt_iou_map_shape = [None, self.dscale, self.tscale]
+        gt_start_shape = [None, self.tscale]
+        gt_end_shape = [None, self.tscale]
+        fileid_shape = [None, 1]
+        self.use_dataloader = use_dataloader
         # set init data to None
-        py_reader = None
         feat = None
         gt_iou_map = None
         gt_start = None
         gt_end = None
         fileid = None
-        feat = fluid.layers.data(name='feat', shape=feat_shape, dtype='float32')
+        feat = fluid.data(name='feat', shape=feat_shape, dtype='float32')
         feed_list = []
         feed_list.append(feat)
         if (self.mode == 'train') or (self.mode == 'valid'):
-            gt_start = fluid.layers.data(
+            gt_start = fluid.data(
                 name='gt_start', shape=gt_start_shape, dtype='float32')
-            gt_end = fluid.layers.data(
+            gt_end = fluid.data(
                 name='gt_end', shape=gt_end_shape, dtype='float32')
-            gt_iou_map = fluid.layers.data(
+            gt_iou_map = fluid.data(
                 name='gt_iou_map', shape=gt_iou_map_shape, dtype='float32')
             feed_list.append(gt_iou_map)
             feed_list.append(gt_start)
             feed_list.append(gt_end)
         elif self.mode == 'test':
-            gt_start = fluid.layers.data(
+            gt_start = fluid.data(
                 name='gt_start', shape=gt_start_shape, dtype='float32')
-            gt_end = fluid.layers.data(
+            gt_end = fluid.data(
                 name='gt_end', shape=gt_end_shape, dtype='float32')
-            gt_iou_map = fluid.layers.data(
+            gt_iou_map = fluid.data(
                 name='gt_iou_map', shape=gt_iou_map_shape, dtype='float32')
             feed_list.append(gt_iou_map)
             feed_list.append(gt_start)
             feed_list.append(gt_end)
-            fileid = fluid.layers.data(
+            fileid = fluid.data(
                 name='fileid', shape=fileid_shape, dtype='int64')
             feed_list.append(fileid)
         elif self.mode == 'infer':
...
@@ -102,10 +101,10 @@ class BMN(ModelBase):
             raise NotImplementedError('mode {} not implemented'.format(
                 self.mode))
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=feed_list, capacity=8, iterable=True)

         self.feat_input = [feat]
...
PaddleCV/PaddleVideo/models/bmn/bmn_utils.py
...
@@ -27,7 +27,6 @@ def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
     int_xmax = np.minimum(anchors_max, box_max)
     inter_len = np.maximum(int_xmax - int_xmin, 0.)
     union_len = len_anchors - inter_len + box_max - box_min
-    #print inter_len,union_len
     jaccard = np.divide(inter_len, union_len)
     return jaccard
...
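As a quick sanity check of `iou_with_anchors`, consider a hypothetical anchor [0.2, 0.5] against a ground-truth box [0.3, 0.6]:

    import numpy as np

    # inter = min(0.5, 0.6) - max(0.2, 0.3) = 0.2
    # union = (0.5 - 0.2) - 0.2 + (0.6 - 0.3) = 0.4
    print(iou_with_anchors(np.array(0.2), np.array(0.5), 0.3, 0.6))  # -> 0.5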
PaddleCV/PaddleVideo/models/bsn/bsn.py
...
@@ -47,47 +47,46 @@ class BsnTem(ModelBase):
             'l2_weight_decay')
         self.lr_decay_iter = self.get_config_from_sec('train', 'lr_decay_iter')

-    def build_input(self, use_pyreader=True):
-        feat_shape = [self.feat_dim, self.tscale]
-        gt_start_shape = [self.tscale]
-        gt_end_shape = [self.tscale]
-        gt_action_shape = [self.tscale]
-        fileid_shape = [1]
-        self.use_pyreader = use_pyreader
+    def build_input(self, use_dataloader=True):
+        feat_shape = [None, self.feat_dim, self.tscale]
+        gt_start_shape = [None, self.tscale]
+        gt_end_shape = [None, self.tscale]
+        gt_action_shape = [None, self.tscale]
+        fileid_shape = [None, 1]
+        self.use_dataloader = use_dataloader
         # set init data to None
-        py_reader = None
         feat = None
         gt_start = None
         gt_end = None
         gt_action = None
         fileid = None
-        feat = fluid.layers.data(name='feat', shape=feat_shape, dtype='float32')
+        feat = fluid.data(name='feat', shape=feat_shape, dtype='float32')
         feed_list = []
         feed_list.append(feat)
         if (self.mode == 'train') or (self.mode == 'valid'):
-            gt_start = fluid.layers.data(
+            gt_start = fluid.data(
                 name='gt_start', shape=gt_start_shape, dtype='float32')
-            gt_end = fluid.layers.data(
+            gt_end = fluid.data(
                 name='gt_end', shape=gt_end_shape, dtype='float32')
-            gt_action = fluid.layers.data(
+            gt_action = fluid.data(
                 name='gt_action', shape=gt_action_shape, dtype='float32')
             feed_list.append(gt_start)
             feed_list.append(gt_end)
             feed_list.append(gt_action)
         elif self.mode == 'test':
-            gt_start = fluid.layers.data(
+            gt_start = fluid.data(
                 name='gt_start', shape=gt_start_shape, dtype='float32')
-            gt_end = fluid.layers.data(
+            gt_end = fluid.data(
                 name='gt_end', shape=gt_end_shape, dtype='float32')
-            gt_action = fluid.layers.data(
+            gt_action = fluid.data(
                 name='gt_action', shape=gt_action_shape, dtype='float32')
             feed_list.append(gt_start)
             feed_list.append(gt_end)
             feed_list.append(gt_action)
-            fileid = fluid.layers.data(
+            fileid = fluid.data(
                 name='fileid', shape=fileid_shape, dtype='int64')
             feed_list.append(fileid)
         elif self.mode == 'infer':
...
@@ -97,10 +96,10 @@ class BsnTem(ModelBase):
             raise NotImplementedError('mode {} not implemented'.format(
                 self.mode))
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=feed_list, capacity=8, iterable=True)

         self.feat_input = [feat]
...
@@ -212,50 +211,47 @@ class BsnPem(ModelBase):
             'l2_weight_decay')
         self.lr_decay_iter = self.get_config_from_sec('train', 'lr_decay_iter')

-    def build_input(self, use_pyreader=True):
-        feat_shape = [self.top_K, self.feat_dim]
-        gt_iou_shape = [self.top_K, 1]
-        props_info_shape = [self.top_K, 4]
-        fileid_shape = [1]
-        self.use_pyreader = use_pyreader
+    def build_input(self, use_dataloader=True):
+        feat_shape = [None, self.top_K, self.feat_dim]
+        gt_iou_shape = [None, self.top_K, 1]
+        props_info_shape = [None, self.top_K, 4]
+        fileid_shape = [None, 1]
+        self.use_dataloader = use_dataloader
         # set init data to None
-        py_reader = None
         feat = None
         gt_iou = None
         props_info = None
         fileid = None
-        feat = fluid.layers.data(name='feat', shape=feat_shape, dtype='float32')
+        feat = fluid.data(name='feat', shape=feat_shape, dtype='float32')
         feed_list = []
         feed_list.append(feat)
         if (self.mode == 'train') or (self.mode == 'valid'):
-            gt_iou = fluid.layers.data(
+            gt_iou = fluid.data(
                 name='gt_iou', shape=gt_iou_shape, dtype='float32')
             feed_list.append(gt_iou)
         elif self.mode == 'test':
-            gt_iou = fluid.layers.data(
+            gt_iou = fluid.data(
                 name='gt_iou', shape=gt_iou_shape, dtype='float32')
-            props_info = fluid.layers.data(
+            props_info = fluid.data(
                 name='props_info', shape=props_info_shape, dtype='float32')
             feed_list.append(gt_iou)
             feed_list.append(props_info)
-            fileid = fluid.layers.data(
+            fileid = fluid.data(
                 name='fileid', shape=fileid_shape, dtype='int64')
             feed_list.append(fileid)
         elif self.mode == 'infer':
-            props_info = fluid.layers.data(
-                name='props_info', shape=props_info_shape, dtype='float32')
-            feed_list.append(props_info)
             pass
         else:
             raise NotImplementedError('mode {} not implemented'.format(
                 self.mode))
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=feed_list, capacity=4, iterable=True)

         self.feat_input = [feat]
...
@@ -306,7 +302,7 @@ class BsnPem(ModelBase):
         elif self.mode == 'test':
             return self.feat_input + [self.gt_iou, self.props_info, self.fileid]
         elif self.mode == 'infer':
-            return self.feat_input + [self.props_info]
+            return self.feat_input
         else:
             raise NotImplementedError('mode {} not implemented'.format(
                 self.mode))
...
@@ -323,8 +319,7 @@ class BsnPem(ModelBase):
                 [self.props_info, self.fileid]
         elif self.mode == 'infer':
             preds = self.outputs()
-            fetch_list = [item for item in preds] + \
-                [self.props_info]
+            fetch_list = [item for item in preds]
         else:
             raise NotImplementedError('mode {} not implemented'.format(
                 self.mode))
...
PaddleCV/PaddleVideo/models/bsn/bsn_utils.py
...
@@ -31,7 +31,6 @@ def iou_with_anchors(anchors_min, anchors_max, box_min, box_max):
     int_xmax = np.minimum(anchors_max, box_max)
     inter_len = np.maximum(int_xmax - int_xmin, 0.)
     union_len = len_anchors - inter_len + box_max - box_min
-    #print inter_len,union_len
     jaccard = np.divide(inter_len, union_len)
     return jaccard
...
PaddleCV/PaddleVideo/models/ctcn/ctcn.py
...
@@ -51,37 +51,35 @@ class CTCN(ModelBase):
         self.momentum = self.get_config_from_sec('train', 'momentum')
         self.lr_decay_iter = self.get_config_from_sec('train', 'lr_decay_iter')

-    def build_input(self, use_pyreader=True):
-        image_shape = [1, self.img_size, self.concept_size]
-        loc_shape = [self.total_num_anchors, 2]
-        cls_shape = [self.total_num_anchors]
-        fileid_shape = [1]
-        self.use_pyreader = use_pyreader
+    def build_input(self, use_dataloader=True):
+        image_shape = [None, 1, self.img_size, self.concept_size]
+        loc_shape = [None, self.total_num_anchors, 2]
+        cls_shape = [None, self.total_num_anchors]
+        fileid_shape = [None, 1]
+        self.use_dataloader = use_dataloader
         # set init data to None
-        py_reader = None
         image = None
         loc_targets = None
         cls_targets = None
         fileid = None
-        image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+        image = fluid.data(name='image', shape=image_shape, dtype='float32')
         feed_list = []
         feed_list.append(image)
         if (self.mode == 'train') or (self.mode == 'valid'):
-            loc_targets = fluid.layers.data(
+            loc_targets = fluid.data(
                 name='loc_targets', shape=loc_shape, dtype='float32')
-            cls_targets = fluid.layers.data(
+            cls_targets = fluid.data(
                 name='cls_targets', shape=cls_shape, dtype='int64')
             feed_list.append(loc_targets)
             feed_list.append(cls_targets)
         elif self.mode == 'test':
-            loc_targets = fluid.layers.data(
+            loc_targets = fluid.data(
                 name='loc_targets', shape=loc_shape, dtype='float32')
-            cls_targets = fluid.layers.data(
+            cls_targets = fluid.data(
                 name='cls_targets', shape=cls_shape, dtype='int64')
-            fileid = fluid.layers.data(
+            fileid = fluid.data(
                 name='fileid', shape=fileid_shape, dtype='int64')
             feed_list.append(loc_targets)
             feed_list.append(cls_targets)
...
@@ -93,10 +91,10 @@ class CTCN(ModelBase):
             raise NotImplementedError('mode {} not implemented'.format(
                 self.mode))
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=feed_list, capacity=4, iterable=True)

         self.feature_input = [image]
...
PaddleCV/PaddleVideo/models/ets/README.md
0 → 100644
# ETS Video Captioning Model

---
## Contents

- [Model Introduction](#model-introduction)
- [Data Preparation](#data-preparation)
- [Training](#training)
- [Evaluation](#evaluation)
- [Inference](#inference)
- [Reference](#reference)


## Model Introduction

Describing Videos by Exploiting Temporal Structure, proposed by Li Yao et al. of the University of Montreal, is a classic model for generating textual descriptions of video clips, abbreviated here as ETS. It follows the encoder-decoder idea: for an input video it first extracts local spatio-temporal features with 3D convolutions, then introduces an attention mechanism over the temporal dimension, fuses the local features at global scale with an LSTM, and finally outputs the textual description.

For details, please refer to [Describing Videos by Exploiting Temporal Structure](https://arxiv.org/abs/1502.08029).

## Data Preparation

ETS is trained on the dataset provided by ActivityNet Captions. For data download and preparation, see the [data notes](../../data/dataset/ets/README.md).

## Training

Once the data is prepared, training can be started in either of the following two ways:

    export CUDA_VISIBLE_DEVICES=0,1,2,3
    export FLAGS_fast_eager_deletion_mode=1
    export FLAGS_eager_delete_tensor_gb=0.0
    export FLAGS_fraction_of_gpu_memory_to_use=0.98
    python train.py --model_name=ETS \
                    --config=./configs/ets.yaml \
                    --log_interval=10 \
                    --valid_interval=1 \
                    --use_gpu=True \
                    --save_dir=./data/checkpoints \
                    --fix_random_seed=False

    bash run.sh train ETS ./configs/ets.yaml

- To train from scratch, the command line or the script above is all that is needed; no pretrained model is required.
- You may download the released [model](https://paddlemodels.bj.bcebos.com/video_caption/ETS_final.pdparams) and pass its path via `--resume` for finetuning and further development.

**Training strategy:**

*  Adam optimizer
*  L2 weight decay coefficient of 1e-4
*  Learning rate scheduled with Noam decay (see the sketch after this README)

## Evaluation

The model can be evaluated in either of the following two ways:

    python eval.py --model_name=ETS \
                   --config=./configs/ets.yaml \
                   --log_interval=1 \
                   --weights=$PATH_TO_WEIGHTS \
                   --use_gpu=True

    bash run.sh eval ETS ./configs/ets.yaml

- When evaluating with `run.sh`, modify the `weights` parameter in the script to point to the weights to be evaluated.
- If `--weights` is not specified, the script downloads the released [model](https://paddlemodels.bj.bcebos.com/video_caption/ETS_final.pdparams) and evaluates it.
- Running the above saves the test results to a json file, by default under the data/evaluate\_results directory. METEOR can then be computed with the official ActivityNet Captions evaluation scripts; see [metrics](../../metrics/ets_metrics/README.md) for the procedure.
- When evaluating on CPU, set `use_gpu` to False in the command line above or in the run.sh script.

Evaluation accuracy on the ActivityNet Captions dataset:

| METEOR |
| :----: |
|  9.8   |

## Inference

Inference can be started in either of the following two ways:

    python predict.py --model_name=ETS \
                      --config=./configs/ets.yaml \
                      --log_interval=1 \
                      --weights=$PATH_TO_WEIGHTS \
                      --filelist=$FILELIST \
                      --use_gpu=True

    bash run.sh predict ETS ./configs/ets.yaml

- When launching from the python command line, `--filelist` specifies the list of files to infer on; users may also follow step 3 of the [data notes](../../data/dataset/ets/README.md) to generate the default inference file list. `--weights` is the path to trained weights; if unset, the program automatically downloads the released weights.
- When inferring with `run.sh`, modify the `weights` parameter in the script to point to the weights to use.
- If `--weights` is not specified, the script downloads the released [model](https://paddlemodels.bj.bcebos.com/video_caption/ETS_final.pdparams) and runs inference with it.
- Inference results are stored in a json file, by default under the `data/dataset/predict_results` directory.
- When inferring on CPU, set `use_gpu` to False in the command line or the run.sh script.

## Reference

- [Describing Videos by Exploiting Temporal Structure](https://arxiv.org/abs/1502.08029)
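The Noam decay mentioned in the training strategy above — and invoked via `fluid.layers.learning_rate_scheduler.noam_decay(self.gru_hidden_dim, 1000)` in `ets.py` below — follows the schedule from "Attention Is All You Need". A plain-Python sketch of the value it produces at a given step (hidden dim and warmup count here mirror the config and are otherwise illustrative):

    def noam_lr(step, d_model, warmup_steps):
        # lr = d_model^-0.5 * min(step^-0.5, step * warmup_steps^-1.5)
        return d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5)

    for step in (1, 500, 1000, 5000):
        print(step, noam_lr(step, d_model=512, warmup_steps=1000))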
PaddleCV/PaddleVideo/models/ets/__init__.py
0 → 100644
from .ets import *
PaddleCV/PaddleVideo/models/ets/ets.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import paddle
import paddle.fluid as fluid
from paddle.fluid import ParamAttr
import numpy as np

from ..model import ModelBase
from . import ets_net

import logging
logger = logging.getLogger(__name__)

__all__ = ["ETS"]


class ETS(ModelBase):
    """ETS model"""

    def __init__(self, name, cfg, mode='train'):
        super(ETS, self).__init__(name, cfg, mode=mode)
        self.get_config()

    def get_config(self):
        self.feat_size = self.get_config_from_sec('MODEL', 'feat_size')
        self.fc_dim = self.get_config_from_sec('MODEL', 'fc_dim')
        self.gru_hidden_dim = self.get_config_from_sec('MODEL', 'gru_hidden_dim')
        self.decoder_size = self.get_config_from_sec('MODEL', 'decoder_size')
        self.word_emb_dim = self.get_config_from_sec('MODEL', 'word_emb_dim')
        self.dict_file = self.get_config_from_sec('MODEL', 'dict_file')
        self.max_length = self.get_config_from_sec('MODEL', 'max_length')
        self.beam_size = self.get_config_from_sec('MODEL', 'beam_size')

        self.num_epochs = self.get_config_from_sec('train', 'epoch')
        self.l2_weight_decay = self.get_config_from_sec('train', 'l2_weight_decay')
        self.clip_norm = self.get_config_from_sec('train', 'clip_norm')

    def build_input(self, use_dataloader=True):
        feat_shape = [None, self.feat_size]
        word_shape = [None, 1]
        word_next_shape = [None, 1]

        # set init data to None
        py_reader = None
        feat = None
        word = None
        word_next = None
        init_ids = None
        init_scores = None
        self.use_dataloader = use_dataloader

        feat = fluid.data(
            name='feat', shape=feat_shape, dtype='float32', lod_level=1)
        feed_list = []
        feed_list.append(feat)

        if (self.mode == 'train') or (self.mode == 'valid'):
            word = fluid.data(
                name='word', shape=word_shape, dtype='int64', lod_level=1)
            word_next = fluid.data(
                name='word_next',
                shape=word_next_shape,
                dtype='int64',
                lod_level=1)
            feed_list.append(word)
            feed_list.append(word_next)
        elif (self.mode == 'test') or (self.mode == 'infer'):
            init_ids = fluid.data(
                name="init_ids", shape=[None, 1], dtype="int64", lod_level=2)
            init_scores = fluid.data(
                name="init_scores",
                shape=[None, 1],
                dtype="float32",
                lod_level=2)
        else:
            raise NotImplementedError('mode {} not implemented'.format(
                self.mode))

        if use_dataloader:
            assert self.mode != 'infer', \
                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
            self.dataloader = fluid.io.DataLoader.from_generator(
                feed_list=feed_list, capacity=16, iterable=True)

        self.feature_input = [feat]
        self.word = word
        self.word_next = word_next
        self.init_ids = init_ids
        self.init_scores = init_scores

    def create_model_args(self):
        cfg = {}
        cfg['feat_size'] = self.feat_size
        cfg['fc_dim'] = self.fc_dim
        cfg['gru_hidden_dim'] = self.gru_hidden_dim
        cfg['decoder_size'] = self.decoder_size
        cfg['word_emb_dim'] = self.word_emb_dim

        word_dict = dict()
        with open(self.dict_file, 'r') as f:
            for i, line in enumerate(f):
                word_dict[line.strip().split()[0]] = i
        dict_size = len(word_dict)
        cfg['dict_size'] = dict_size
        cfg['max_length'] = self.max_length
        cfg['beam_size'] = self.beam_size
        return cfg

    def build_model(self):
        cfg = self.create_model_args()
        self.videomodel = ets_net.ETSNET(
            feat_size=cfg['feat_size'],
            fc_dim=cfg['fc_dim'],
            gru_hidden_dim=cfg['gru_hidden_dim'],
            decoder_size=cfg['decoder_size'],
            word_emb_dim=cfg['word_emb_dim'],
            dict_size=cfg['dict_size'],
            max_length=cfg['max_length'],
            beam_size=cfg['beam_size'],
            mode=self.mode)
        if (self.mode == 'train') or (self.mode == 'valid'):
            prob = self.videomodel.net(self.feature_input[0], self.word)
            self.network_outputs = [prob]
        elif (self.mode == 'test') or (self.mode == 'infer'):
            translation_ids, translation_scores = self.videomodel.net(
                self.feature_input[0], self.init_ids, self.init_scores)
            self.network_outputs = [translation_ids, translation_scores]

    def optimizer(self):
        l2_weight_decay = self.l2_weight_decay
        clip_norm = self.clip_norm
        fluid.clip.set_gradient_clip(
            clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=clip_norm))
        lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
            self.gru_hidden_dim, 1000)
        optimizer = fluid.optimizer.Adam(
            learning_rate=lr_decay,
            regularization=fluid.regularizer.L2DecayRegularizer(
                regularization_coeff=l2_weight_decay))
        return optimizer

    def loss(self):
        assert self.mode != 'infer', "invalid loss calculationg in infer mode"
        self.loss_ = self.videomodel.loss(self.network_outputs[0],
                                          self.word_next)
        return self.loss_

    def outputs(self):
        return self.network_outputs

    def feeds(self):
        if (self.mode == 'train') or (self.mode == 'valid'):
            return self.feature_input + [self.word, self.word_next]
        elif (self.mode == 'test') or (self.mode == 'infer'):
            return self.feature_input + [self.init_ids, self.init_scores]
        else:
            raise NotImplementedError('mode {} not implemented'.format(
                self.mode))

    def fetches(self):
        if (self.mode == 'train') or (self.mode == 'valid'):
            losses = self.loss()
            fetch_list = [item for item in losses]
        elif (self.mode == 'test') or (self.mode == 'infer'):
            preds = self.outputs()
            fetch_list = [item for item in preds]
        else:
            raise NotImplementedError('mode {} not implemented'.format(
                self.mode))
        return fetch_list

    def pretrain_info(self):
        return (None, None)

    def weights_info(self):
        return (
            'ETS_final.pdparams',
            'https://paddlemodels.bj.bcebos.com/video_caption/ETS_final.pdparams'
        )
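The `feat` and `word` inputs above are declared with `lod_level=1`, i.e. variable-length sequences. As a usage sketch (shapes and values hypothetical), such an input is fed as a LoD tensor built with `fluid.create_lod_tensor`:

    import numpy as np
    import paddle.fluid as fluid

    place = fluid.CPUPlace()
    # Two videos with 3 and 2 feature steps respectively; feat_size assumed 4 here.
    flat_feats = np.random.rand(5, 4).astype('float32')
    feat_lod_tensor = fluid.create_lod_tensor(flat_feats, [[3, 2]], place)
    # feed as {'feat': feat_lod_tensor} when running the test/infer program.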
PaddleCV/PaddleVideo/models/ets/ets_net.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import paddle.fluid as fluid
from paddle.fluid import ParamAttr
import numpy as np

DATATYPE = 'float32'


class ETSNET(object):
    def __init__(self,
                 feat_size,
                 fc_dim,
                 gru_hidden_dim,
                 max_length,
                 beam_size,
                 decoder_size,
                 word_emb_dim,
                 dict_size,
                 mode='train'):
        self.feat_size = feat_size
        self.fc_dim = fc_dim
        self.gru_hidden_dim = gru_hidden_dim
        self.decoder_size = decoder_size
        self.word_emb_dim = word_emb_dim
        self.dict_size = dict_size
        self.max_length = max_length
        self.beam_size = beam_size
        self.mode = mode

    def encoder(self, feat):
        bias_attr = fluid.ParamAttr(
            regularizer=fluid.regularizer.L2Decay(0.0),
            initializer=fluid.initializer.NormalInitializer(scale=0.0))

        input_fc = fluid.layers.fc(
            input=feat, size=self.fc_dim, act='tanh', bias_attr=bias_attr)
        gru_forward_fc = fluid.layers.fc(
            input=input_fc, size=self.gru_hidden_dim * 3, bias_attr=False)
        gru_forward = fluid.layers.dynamic_gru(
            input=gru_forward_fc, size=self.gru_hidden_dim, is_reverse=False)
        gru_backward_fc = fluid.layers.fc(
            input=input_fc, size=self.gru_hidden_dim * 3, bias_attr=False)
        gru_backward = fluid.layers.dynamic_gru(
            input=gru_backward_fc, size=self.gru_hidden_dim, is_reverse=True)

        encoded_sequence = fluid.layers.concat(
            input=[gru_forward, gru_backward], axis=1)

        gru_weights = fluid.layers.fc(
            input=encoded_sequence,
            size=1,
            act='sequence_softmax',
            bias_attr=False)
        gru_scaled = fluid.layers.elementwise_mul(
            x=encoded_sequence, y=gru_weights, axis=0)
        encoded_vector = fluid.layers.sequence_pool(
            input=gru_scaled, pool_type='sum')
        encoded_proj = fluid.layers.fc(
            input=encoded_sequence, size=self.decoder_size, bias_attr=False)
        return encoded_sequence, encoded_vector, encoded_proj

    def cell(self, x, hidden, encoder_out, encoder_out_proj):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(
                input=decoder_state, size=self.decoder_size, bias_attr=False)
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            mixed_state = fluid.layers.elementwise_add(encoder_proj,
                                                       decoder_state_expand)
            attention_weights = fluid.layers.fc(
                input=mixed_state, size=1, bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(
                input=attention_weights)
            weigths_reshape = fluid.layers.reshape(
                x=attention_weights, shape=[-1])
            scaled = fluid.layers.elementwise_mul(
                x=encoder_vec, y=weigths_reshape, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        context = simple_attention(encoder_out, encoder_out_proj, hidden)
        out = fluid.layers.fc(
            input=[x, context], size=self.decoder_size * 3, bias_attr=False)
        out = fluid.layers.gru_unit(
            input=out, hidden=hidden, size=self.decoder_size * 3)[0]
        return out, out

    def train_decoder(self, word, encoded_sequence, encoded_vector,
                      encoded_proj):
        decoder_boot = fluid.layers.fc(
            input=encoded_vector,
            size=self.decoder_size,
            act='tanh',
            bias_attr=False)

        word_embedding = fluid.layers.embedding(
            input=word, size=[self.dict_size, self.word_emb_dim])
        pad_value = fluid.layers.assign(
            input=np.array([0.], dtype=np.float32))
        word_embedding, length = fluid.layers.sequence_pad(word_embedding,
                                                           pad_value)
        word_embedding = fluid.layers.transpose(word_embedding, [1, 0, 2])

        rnn = fluid.layers.StaticRNN()
        with rnn.step():
            x = rnn.step_input(word_embedding)
            pre_state = rnn.memory(init=decoder_boot)
            out, current_state = self.cell(x, pre_state, encoded_sequence,
                                           encoded_proj)
            prob = fluid.layers.fc(
                input=out, size=self.dict_size, act='softmax')
            rnn.update_memory(pre_state, current_state)
            rnn.step_output(prob)

        rnn_out = rnn()
        rnn_out = fluid.layers.transpose(rnn_out, [1, 0, 2])
        length = fluid.layers.reshape(length, [-1])
        rnn_out = fluid.layers.sequence_unpad(x=rnn_out, length=length)
        return rnn_out

    def infer_decoder(self, init_ids, init_scores, encoded_sequence,
                      encoded_vector, encoded_proj):
        decoder_boot = fluid.layers.fc(
            input=encoded_vector,
            size=self.decoder_size,
            act='tanh',
            bias_attr=False)

        max_len = fluid.layers.fill_constant(
            shape=[1], dtype='int64', value=self.max_length)
        counter = fluid.layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # create and init arrays to save selected ids, scores and states for each step
        ids_array = fluid.layers.array_write(init_ids, i=counter)
        scores_array = fluid.layers.array_write(init_scores, i=counter)
        state_array = fluid.layers.array_write(decoder_boot, i=counter)

        cond = fluid.layers.less_than(x=counter, y=max_len)
        while_op = fluid.layers.While(cond=cond)
        with while_op.block():
            pre_ids = fluid.layers.array_read(array=ids_array, i=counter)
            pre_score = fluid.layers.array_read(array=scores_array, i=counter)
            pre_state = fluid.layers.array_read(array=state_array, i=counter)

            pre_ids_emb = fluid.layers.embedding(
                input=pre_ids, size=[self.dict_size, self.word_emb_dim])
            out, current_state = self.cell(pre_ids_emb, pre_state,
                                           encoded_sequence, encoded_proj)
            prob = fluid.layers.fc(
                input=out, size=self.dict_size, act='softmax')

            # beam search
            topk_scores, topk_indices = fluid.layers.topk(
                prob, k=self.beam_size)
            accu_scores = fluid.layers.elementwise_add(
                x=fluid.layers.log(topk_scores),
                y=fluid.layers.reshape(pre_score, shape=[-1]),
                axis=0)
            accu_scores = fluid.layers.lod_reset(x=accu_scores, y=pre_ids)
            selected_ids, selected_scores = fluid.layers.beam_search(
                pre_ids,
                pre_score,
                topk_indices,
                accu_scores,
                self.beam_size,
                end_id=1)

            fluid.layers.increment(x=counter, value=1, in_place=True)
            # save selected ids and corresponding scores of each step
            fluid.layers.array_write(selected_ids, array=ids_array, i=counter)
            fluid.layers.array_write(
                selected_scores, array=scores_array, i=counter)

            # update rnn state by sequence_expand acting as gather
            current_state = fluid.layers.sequence_expand(current_state,
                                                         selected_scores)
            fluid.layers.array_write(
                current_state, array=state_array, i=counter)
            current_enc_seq = fluid.layers.sequence_expand(encoded_sequence,
                                                           selected_scores)
            fluid.layers.assign(current_enc_seq, encoded_sequence)
            current_enc_proj = fluid.layers.sequence_expand(encoded_proj,
                                                            selected_scores)
            fluid.layers.assign(current_enc_proj, encoded_proj)

            # update conditional variable
            length_cond = fluid.layers.less_than(x=counter, y=max_len)
            finish_cond = fluid.layers.logical_not(
                fluid.layers.is_empty(x=selected_ids))
            fluid.layers.logical_and(x=length_cond, y=finish_cond, out=cond)

        translation_ids, translation_scores = fluid.layers.beam_search_decode(
            ids=ids_array,
            scores=scores_array,
            beam_size=self.beam_size,
            end_id=1)
        return translation_ids, translation_scores

    def net(self, feat, *input_decoder):
        encoded_sequence, encoded_vector, encoded_proj = self.encoder(feat)
        if (self.mode == 'train') or (self.mode == 'valid'):
            word, = input_decoder
            prob = self.train_decoder(word, encoded_sequence, encoded_vector,
                                      encoded_proj)
            return prob
        else:
            init_ids, init_scores = input_decoder
            translation_ids, translation_scores = self.infer_decoder(
                init_ids, init_scores, encoded_sequence, encoded_vector,
                encoded_proj)
            return translation_ids, translation_scores

    def loss(self, prob, word_next):
        cost = fluid.layers.cross_entropy(input=prob, label=word_next)
        avg_cost = fluid.layers.mean(cost)
        return [avg_cost]
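The `simple_attention` helper inside `cell` above is additive attention (without the usual tanh nonlinearity): with encoder states $h_i$ — projected once into `encoded_proj` — and previous decoder state $s_{t-1}$, it computes

$$e_{t,i} = w^{\top}\left(U h_i + W s_{t-1}\right), \qquad \alpha_{t,i} = \frac{\exp(e_{t,i})}{\sum_{j}\exp(e_{t,j})}, \qquad c_t = \sum_{i} \alpha_{t,i}\, h_i,$$

where $U h_i$ is `encoder_proj`, $W s_{t-1}$ is `decoder_state_expand`, the per-sequence softmax is `sequence_softmax`, and $c_t$ is the `context` returned by `sequence_pool`.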
PaddleCV/PaddleVideo/models/model.py
...
@@ -65,13 +65,13 @@ class ModelBase(object):
         self.is_training = (mode == 'train')
         self.mode = mode
         self.cfg = cfg
-        self.py_reader = None
+        self.dataloader = None

     def build_model(self):
         "build model struct"
         raise NotImplementError(self, self.build_model)

-    def build_input(self, use_pyreader):
+    def build_input(self, use_dataloader):
         "build input Variable"
         raise NotImplementError(self, self.build_input)
...
@@ -114,8 +114,8 @@ class ModelBase(object):
             wget.download(url, path)
         return path

-    def pyreader(self):
-        return self.py_reader
+    def dataloader(self):
+        return self.dataloader

     def epoch_num(self):
         "get train epoch num"
...
PaddleCV/PaddleVideo/models/nextvlad/nextvlad.py
...
@@ -61,17 +61,17 @@ class NEXTVLAD(ModelBase):
         # other params
         self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')

-    def build_input(self, use_pyreader=True):
-        rgb_shape = [self.video_feature_size]
-        audio_shape = [self.audio_feature_size]
-        label_shape = [self.num_classes]
+    def build_input(self, use_dataloader=True):
+        rgb_shape = [None, self.video_feature_size]
+        audio_shape = [None, self.audio_feature_size]
+        label_shape = [None, self.num_classes]

-        rgb = fluid.layers.data(
+        rgb = fluid.data(
             name='train_rgb' if self.is_training else 'test_rgb',
             shape=rgb_shape,
             dtype='uint8',
             lod_level=1)
-        audio = fluid.layers.data(
+        audio = fluid.data(
             name='train_audio' if self.is_training else 'test_audio',
             shape=audio_shape,
             dtype='uint8',
...
@@ -79,15 +79,15 @@ class NEXTVLAD(ModelBase):
         if self.mode == 'infer':
             label = None
         else:
-            label = fluid.layers.data(
+            label = fluid.data(
                 name='train_label' if self.is_training else 'test_label',
                 shape=label_shape,
                 dtype='float32')
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=[rgb, audio, label], capacity=8, iterable=True)

         self.feature_input = [rgb, audio]
         self.label_input = label
...
PaddleCV/PaddleVideo/models/nonlocal_model/nonlocal_model.py
...
@@ -18,7 +18,7 @@ import paddle.fluid as fluid
 from ..model import ModelBase
 from . import resnet_video
-from .nonlocal_utils import load_params_from_file
+from .nonlocal_utils import load_pretrain_params_from_file, load_weights_params_from_file

 import logging
 logger = logging.getLogger(__name__)
...
@@ -39,26 +39,28 @@ class NonLocal(ModelBase):
         # crop size
         self.crop_size = self.get_config_from_sec(self.mode, 'crop_size')

-    def build_input(self, use_pyreader=True):
-        input_shape = [3, self.video_length, self.crop_size, self.crop_size]
-        label_shape = [1]
+    def build_input(self, use_dataloader=True):
+        input_shape = [None, 3, self.video_length, self.crop_size, self.crop_size]
+        label_shape = [None, 1]

-        data = fluid.layers.data(
+        data = fluid.data(
             name='train_data' if self.is_training else 'test_data',
             shape=input_shape,
             dtype='float32')
         if self.mode != 'infer':
-            label = fluid.layers.data(
+            label = fluid.data(
                 name='train_label' if self.is_training else 'test_label',
                 shape=label_shape,
                 dtype='int64')
         else:
             label = None

-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            self.py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=[data, label], capacity=4, iterable=True)

         self.feature_input = [data]
...
@@ -140,20 +142,10 @@ class NonLocal(ModelBase):
             )

     def load_pretrain_params(self, exe, pretrain, prog, place):
-        load_params_from_file(exe, prog, pretrain, place)
+        load_pretrain_params_from_file(exe, prog, pretrain, place)

     def load_test_weights(self, exe, weights, prog, place):
-        super(NonLocal, self).load_test_weights(exe, weights, prog, place)
-        pred_w = fluid.global_scope().find_var('pred_w').get_tensor()
-        pred_array = np.array(pred_w)
-        pred_w_shape = pred_array.shape
-        if len(pred_w_shape) == 2:
-            logger.info('reshape for pred_w when test')
-            pred_array = np.transpose(pred_array, (1, 0))
-            pred_w_shape = pred_array.shape
-            pred_array = np.reshape(
-                pred_array, [pred_w_shape[0], pred_w_shape[1], 1, 1, 1])
-            pred_w.set(pred_array.astype('float32'), place)
+        load_weights_params_from_file(exe, prog, weights, place)

 def get_learning_rate_decay_list(base_learning_rate, lr_decay, step_lists):
...
PaddleCV/PaddleVideo/models/nonlocal_model/nonlocal_utils.py
...
@@ -19,58 +19,116 @@ import logging
 logger = logging.getLogger(__name__)

-def load_params_from_file(exe, prog, pretrained_file, place):
-    logger.info('load params from {}'.format(pretrained_file))
+def is_parameter(var):
+    return isinstance(var, fluid.framework.Parameter)
+
+
+def load_pretrain_params_from_file(exe, prog, pretrained_file, place):
+    """
+    The pretrined_file stores ResNet50/101 parameters pretrained on ImageNet.
+    However, the conv_weights of Nonlocal model is not the same as that in ResNet50/101 because the
+    input shape is [N, C, T, H, W] and the convolution kernels' shape is [Cout, Cin, Kt, Kh, Kw]. It is
+    different from the convolution kernels of ResNet whose shape is typically [Cout, Cin, Kh, Kw].
+    When loading conv_weights from the pretrained file, shape mismatch error will be raised due to the check
+    in fluid.io. This check on params' shape is newly added in fluid.version==1.6.0. So it is recommendated to
+    treat conv_weights specifically.
+    The process is as following:
+    1, check the params that will be loaded, those with the same name in the target program and pretrained_file.
+       These params will be called common params in this function.
+    2, Create presistable variables in the new_scope with the name of each common params. If it is the weights of
+       convolution, the created varibale's shape will be set to 2D-convolution-kernel type.
+    3, load params from the pretrained_file into those persistable variables created in the new_scope
+    4, get the value of common params in the new_scope and transform it if it belongs to conv weights.
+    5, set the value to params in the target program
+    """
+    logger.info('load pretrained params from {}'.format(pretrained_file))
     if os.path.isdir(pretrained_file):
-        param_list = prog.block(0).all_parameters()
+        # get params' list in prog
+        param_list = filter(is_parameter, prog.list_vars())
         param_name_list = [p.name for p in param_list]
-        param_shape = {}
-        for name in param_name_list:
-            param_tensor = fluid.global_scope().find_var(name).get_tensor()
-            param_shape[name] = np.array(param_tensor).shape

+        # get all params' names in pretrained_file
         param_name_from_file = os.listdir(pretrained_file)
+        # get common params of prog and pretrained_file
+        # only those common params will be loaded from pretrained_file into prog
         common_names = get_common_names(param_name_list, param_name_from_file)

-        logger.info('-------- loading params -----------')
-
-        # load params from file
-        def is_parameter(var):
-            if isinstance(var, fluid.framework.Parameter):
-                return isinstance(var, fluid.framework.Parameter) and \
-                    os.path.exists(os.path.join(pretrained_file, var.name))
-
-        logger.info("Load pretrain weights from file {}".format(
-            pretrained_file))
-        vars = filter(is_parameter, prog.list_vars())
-        fluid.io.load_vars(exe, pretrained_file, vars=vars, main_program=prog)
-
-        # reset params if necessary
-        for name in common_names:
-            t = fluid.global_scope().find_var(name).get_tensor()
-            t_array = np.array(t)
-            origin_shape = param_shape[name]
-            if t_array.shape == origin_shape:
-                logger.info("load param {}".format(name))
-            elif (t_array.shape[:2] == origin_shape[:2]) and (
-                    t_array.shape[-2:] == origin_shape[-2:]):
-                num_inflate = origin_shape[2]
-                stack_t_array = np.stack(
-                    [t_array] * num_inflate, axis=2) / float(num_inflate)
-                assert origin_shape == stack_t_array.shape, \
-                    "inflated shape should be the same with tensor {}".format(name)
-                t.set(stack_t_array.astype('float32'), place)
-                logger.info("load inflated({}) param {}".format(num_inflate,
-                                                                name))
-            else:
-                logger.info("Invalid case for name: {}".format(name))
-                raise
-        logger.info("finished loading params from resnet pretrained model")
+        # get global scope and block for prog
+        global_scope = fluid.global_scope()
+        global_block = prog.global_block()
+
+        # save details of common params
+        common_var_map = {}
+        for name in common_names:
+            var = global_block.var(name)
+            var_type = var.type
+            var_dtype = var.dtype
+            var_shape = var.shape
+            if len(var_shape) == 5:
+                # When param is conv_weights, its shape is [Cout, Cin, Kt, Kh, Kw].
+                # The corresponding params in ResNet50/101 is [Cout, Cin, Kh, Kw]
+                var_shape2d = (var_shape[0], var_shape[1], var_shape[3],
+                               var_shape[4])
+            else:
+                var_shape2d = var_shape[:]
+            common_var_map[name] = [var_type, var_dtype, var_shape, var_shape2d]
+
+        # create new_scope and new_prog to create vars
+        cpu_place = fluid.CPUPlace()
+        exe_cpu = fluid.Executor(cpu_place)
+        new_scope = fluid.Scope()
+        new_prog = fluid.Program()
+        new_start_prog = fluid.Program()
+        new_block = new_prog.global_block()
+
+        # create vars in new_scope
+        created_vars = []
+        with fluid.scope_guard(new_scope):
+            with fluid.program_guard(new_prog, new_start_prog):
+                for name in common_names:
+                    var_type, var_dtype, var_shape, var_shape2d = \
+                        common_var_map[name]
+                    new_var = new_block.create_var(
+                        name=name,
+                        type=var_type,
+                        shape=var_shape2d,
+                        dtype=var_dtype,
+                        persistable=True)
+                    created_vars.append(new_var)
+
+        # load pretrained_file into the persistable vars created in new_scope
+        with fluid.scope_guard(new_scope):
+            fluid.io.load_vars(
+                exe_cpu, pretrained_file, main_program=new_prog,
+                vars=created_vars)
+
+        logger.info('-------- loading params -----------')
+        for name in common_names:
+            # get the tensor of vars in new_scope
+            new_tensor = new_scope.var(name).get_tensor()
+            new_value = np.array(new_tensor)
+            prog_tensor = global_scope.var(name).get_tensor()
+            var_type, var_dtype, var_shape, var_shape2d = common_var_map[name]
+            # set the value of loaded vars to those with the same name in the target program
+            if len(var_shape) == 5:
+                # transform the loaded conv weights into the format of [Cout, Cin, Kt, Kh, Kw]
+                num_inflate = var_shape[2]
+                stacked_array = np.stack(
+                    [new_value] * num_inflate, axis=2) / float(num_inflate)
+                prog_tensor.set(stacked_array.astype('float32'), place)
+                logger.info("load inflated({}) param {}".format(num_inflate,
+                                                                name))
+            else:
+                prog_tensor.set(new_value, place)
+                logger.info("load param {}".format(name))
     else:
-        logger.info(
-            "pretrained file is not in a directory, not suitable to load params".
-            format(pretrained_file))
-        pass
+        raise TypeError(
+            "pretrained file {} is not in a directory, not suitable to load params".
            format(pretrained_file))


 def get_common_names(param_name_list, param_name_from_file):
...
@@ -96,3 +154,89 @@ def get_common_names(param_name_list, param_name_from_file):
             file_only_names.append(name)
             logger.info(name)
     return common_names
+
+
+def load_weights_params_from_file(exe, prog, weights, place):
+    """
+    The params of the training process is stored in the file named weights.
+    However, the network of the training and test process is slightly different due to the layer
+    named "pred" was fc in trainng but convolution in test. When loading weights of pred (pred_w)
+    from the pretrained file, shape mismatch error will be raised due to the check in fluid.io.
+    This check on params' shape is newly added in fluid.version==1.6.0. So it is recommendated to
+    treat pred_w specifically.
+    The process is as following:
+    1, get the details of param_list in the target program (prog)
+    2, create persistable vars in new_scope with the same name as those in param_list with
+       the details stored in step 1. If the name is 'pred_w', the var shape should be [Cin, Cout].
+    3, get the value of vars in the new_scope.
+       If var.name is 'pred_w', transform it from fc-weights type to be consistent with convolution.
+    4, set the value to params in prog
+    """
+    logger.info('Load test weights from {}'.format(weights))
+
+    # get the param_list in prog
+    prog_vars = filter(is_parameter, prog.list_vars())
+    # save the details of params in prog
+    var_map = {}
+    for var in prog_vars:
+        var_name = var.name
+        var_type = var.type
+        var_dtype = var.dtype
+        var_shape = var.shape
+        # For pred_w, get the fc-weights type shape
+        if var_name == "pred_w":
+            assert len(var_shape) == 5, \
+                "pred_weights.shape shoud be [Cout, Cin, 1, 1, 1] when test"
+            var_shape = (var_shape[1], var_shape[0])
+        var_map[var_name] = [var_type, var_dtype, var_shape]
+
+    # create new_scope and new_prog
+    cpu_place = fluid.CPUPlace()
+    exe_cpu = fluid.Executor(cpu_place)
+    new_scope = fluid.Scope()
+    new_prog = fluid.Program()
+    new_start_prog = fluid.Program()
+    new_block = new_prog.global_block()
+    created_vars = []
+    # create persistable variables in new_scope
+    with fluid.scope_guard(new_scope):
+        with fluid.program_guard(new_prog, new_start_prog):
+            for var_name in var_map.keys():
+                var_type, var_dtype, var_shape = var_map[var_name]
+                new_var = new_block.create_var(
+                    name=var_name,
+                    type=var_type,
+                    shape=var_shape,
+                    dtype=var_dtype,
+                    persistable=True)
+                created_vars.append(new_var)
+    # load params from file into the above vars created in new_scope
+    with fluid.scope_guard(new_scope):
+        fluid.io.load_vars(
+            exe_cpu, '', main_program=new_prog, vars=created_vars,
+            filename=weights)
+
+    # get the global scope of prog
+    global_scope = fluid.global_scope()
+    # set value of vars in new_scope to the params of prog with the same name
+    # and specially treat on "pred_w"
+    for var_name in var_map.keys():
+        global_tensor = global_scope.var(var_name).get_tensor()
+        new_tensor = new_scope.var(var_name).get_tensor()
+        new_value = np.array(new_tensor)
+        if var_name != "pred_w":
+            global_tensor.set(new_value, place)
+        else:
+            pred_array = np.transpose(new_value, (1, 0))
+            pred_array = np.reshape(
+                pred_array,
+                [pred_array.shape[0], pred_array.shape[1], 1, 1, 1])
+            global_tensor.set(pred_array.astype('float32'), place)
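The 2D→3D "inflation" described in the docstring above can be checked in isolation. A small numpy sketch (shapes hypothetical) showing that stacking a 2D kernel `Kt` times along a new time axis and dividing by `Kt` preserves the response to a temporally constant input patch:

    import numpy as np

    w2d = np.random.rand(8, 3, 3, 3)                 # [Cout, Cin, Kh, Kw]
    kt = 3
    w3d = np.stack([w2d] * kt, axis=2) / float(kt)   # [Cout, Cin, Kt, Kh, Kw]

    x2d = np.random.rand(3, 3, 3)                    # one [Cin, Kh, Kw] patch
    x3d = np.stack([x2d] * kt, axis=1)               # replicated along time
    # The inflated 3D kernel matches the 2D response on static input.
    print(np.allclose((w3d * x3d).sum(axis=(1, 2, 3, 4)),
                      (w2d * x2d).sum(axis=(1, 2, 3))))  # -> True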
PaddleCV/PaddleVideo/models/stnet/stnet.py
...
@@ -51,25 +51,23 @@ class STNET(ModelBase):
         self.target_size = self.get_config_from_sec(self.mode, 'target_size')
         self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')

-    def build_input(self, use_pyreader=True):
+    def build_input(self, use_dataloader=True):
         image_shape = [3, self.target_size, self.target_size]
         image_shape[0] = image_shape[0] * self.seglen
-        image_shape = [self.seg_num] + image_shape
-        self.use_pyreader = use_pyreader
+        image_shape = [None, self.seg_num] + image_shape
+        self.use_dataloader = use_dataloader

-        image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+        image = fluid.data(name='image', shape=image_shape, dtype='float32')
         if self.mode != 'infer':
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
         else:
             label = None

-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            py_reader = fluid.io.PyReader(
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
                 feed_list=[image, label], capacity=4, iterable=True)
-            self.py_reader = py_reader

         self.feature_input = [image]
         self.label_input = label
...
@@ -150,20 +148,76 @@ class STNET(ModelBase):
             )

     def load_pretrain_params(self, exe, pretrain, prog, place):
+        """
+        The pretrained params are ResNet50 pretrained on ImageNet.
+        However, conv1_weights of StNet is not the same as that in ResNet50 because the input are super-image
+        concatanated by a series of images. When loading conv1_weights from the pretrained file, shape
+        mismatch error will be raised due to the check in fluid.io. This check on params' shape is newly
+        added in fluid.version==1.6.0. So it is recommendated to treat conv1_weights specifically.
+        The process is as following:
+        1, load params except conv1_weights from pretrain
+        2, create var named 'conv1_weights' in new_scope, and load the value from the pretrain file
+        3, get the value of conv1_weights in the new_scope and transform it
+        4, set the transformed value to conv1_weights in prog
+        """

         def is_parameter(var):
             if isinstance(var, fluid.framework.Parameter):
                 return isinstance(var, fluid.framework.Parameter) and (not ("fc_0" in var.name)) \
-                       and (not ("batch_norm" in var.name)) and (not ("xception" in var.name)) and (not ("conv3d" in var.name))
+                       and (not ("batch_norm" in var.name)) and (not ("xception" in var.name)) \
+                       and (not ("conv3d" in var.name)) and (not ("conv1_weights") in var.name)

-        logger.info("Load pretrain weights from {}, exclude fc, batch_norm, xception, conv3d layers.".
+        logger.info("Load pretrain weights from {}, exclude conv1, fc, batch_norm, xception, conv3d layers.".
                     format(pretrain))
-        vars = filter(is_parameter, prog.list_vars())
-        fluid.io.load_vars(exe, pretrain, vars=vars, main_program=prog)

-        param_tensor = fluid.global_scope().find_var(
-            "conv1_weights").get_tensor()
-        param_numpy = np.array(param_tensor)
-        param_numpy = np.mean(param_numpy, axis=1, keepdims=True) / self.seglen
-        param_numpy = np.repeat(param_numpy, 3 * self.seglen, axis=1)
-        param_tensor.set(param_numpy.astype(np.float32), place)
+        # loaded params from pretrained file exclued conv1, fc, batch_norm, xception, conv3d
+        prog_vars = filter(is_parameter, prog.list_vars())
+        fluid.io.load_vars(exe, pretrain, vars=prog_vars, main_program=prog)
+
+        # get global scope and conv1_weights' details
+        global_scope = fluid.global_scope()
+        global_block = prog.global_block()
+        conv1_weights_name = "conv1_weights"
+        var_conv1_weights = global_block.var(conv1_weights_name)
+        tensor_conv1_weights = global_scope.var(conv1_weights_name).get_tensor()
+        var_type = var_conv1_weights.type
+        var_dtype = var_conv1_weights.dtype
+        var_shape = var_conv1_weights.shape
+        assert var_shape[1] == 3 * self.seglen, \
+            "conv1_weights.shape[1] shoud be 3 x seglen({})".format(self.seglen)
+        # transform shape to be consistent with conv1_weights of ResNet50
+        var_shape = (var_shape[0], 3, var_shape[2], var_shape[3])
+
+        # create new_scope and new_prog to create var with transformed shape
+        cpu_place = fluid.CPUPlace()
+        exe_cpu = fluid.Executor(cpu_place)
+        new_scope = fluid.Scope()
+        new_prog = fluid.Program()
+        new_start_prog = fluid.Program()
+        new_block = new_prog.global_block()
+        with fluid.scope_guard(new_scope):
+            with fluid.program_guard(new_prog, new_start_prog):
+                new_var = new_block.create_var(
+                    name=conv1_weights_name,
+                    type=var_type,
+                    shape=var_shape,
+                    dtype=var_dtype,
+                    persistable=True)
+
+        # load conv1_weights from pretrain file into the var created in new_scope
+        with fluid.scope_guard(new_scope):
+            fluid.io.load_vars(
+                exe_cpu, pretrain, main_program=new_prog, vars=[new_var])
+
+        # get the value of loaded conv1_weights, and transform it
+        new_tensor = new_scope.var(conv1_weights_name).get_tensor()
+        new_value = np.array(new_tensor)
+        param_numpy = np.mean(new_value, axis=1, keepdims=True) / self.seglen
+        param_numpy = np.repeat(param_numpy, 3 * self.seglen, axis=1)
+        # set the value of conv1_weights in the original program
+        tensor_conv1_weights.set(param_numpy.astype(np.float32), place)
+        # All the expected pretrained params are set to prog now
...
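One way to read the conv1_weights transform above (mean over the input-channel axis, scaled by 1/seglen, then tiled back to 3*seglen channels): it keeps conv1's response unchanged for an input patch that is constant across channels and merely repeated across the seglen frames of the super-image. A numpy sketch of that property, with hypothetical sizes:

    import numpy as np

    seglen = 5
    w = np.random.rand(64, 3, 7, 7)                       # ResNet50 conv1 [Cout, 3, Kh, Kw]
    w_mean = np.mean(w, axis=1, keepdims=True) / seglen   # average across input channels
    w_stnet = np.repeat(w_mean, 3 * seglen, axis=1)       # [Cout, 3*seglen, Kh, Kw]

    gray = np.random.rand(7, 7)                           # channel-constant patch
    x_rgb = np.stack([gray] * 3)                          # [3, Kh, Kw]
    x_super = np.stack([gray] * (3 * seglen))             # seglen repeated gray frames

    resp_resnet = (w * x_rgb).sum(axis=(1, 2, 3))
    resp_stnet = (w_stnet * x_super).sum(axis=(1, 2, 3))
    print(np.allclose(resp_resnet, resp_stnet))           # -> True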
PaddleCV/PaddleVideo/models/tall/README.md
0 → 100644
# TALL Video Grounding Model

---
## Contents

- [Model Introduction](#model-introduction)
- [Data Preparation](#data-preparation)
- [Training](#training)
- [Evaluation](#evaluation)
- [Inference](#inference)
- [Reference](#reference)


## Model Introduction

TALL, proposed by Jiyang Gao et al. of the University of Southern California, is a classic model for video grounding. Given an input text query and video clips, TALL uses a Cross-modal Temporal Regression Localizer (CTRL) to combine the video content with the textual description and output location offsets and a confidence score. CTRL consists of four modules: a visual encoder that extracts features from video clips, a sentence encoder that extracts a feature vector from the query, a multi-modal processing network that fuses the textual and visual features into a joint representation, and a temporal regression network that produces the confidence score and offsets (a fusion sketch follows this README).

For details, please refer to [TALL: Temporal Activity Localization via Language Query](https://arxiv.org/abs/1705.02101).

## Data Preparation

TALL is trained on the TACoS dataset. For data download and preparation, see the [data notes](../../data/dataset/tall/README.md).

## Training

Once the data is prepared, training can be started in either of the following two ways:

    export CUDA_VISIBLE_DEVICES=0
    export FLAGS_fast_eager_deletion_mode=1
    export FLAGS_eager_delete_tensor_gb=0.0
    export FLAGS_fraction_of_gpu_memory_to_use=0.98
    python train.py --model_name=TALL \
                    --config=./configs/tall.yaml \
                    --log_interval=10 \
                    --valid_interval=10000 \
                    --use_gpu=True \
                    --save_dir=./data/checkpoints \
                    --fix_random_seed=False

    bash run.sh train TALL ./configs/tall.yaml

- To train from scratch, the command line or the script above is all that is needed; no pretrained model is required.
- You may download the released [model](https://paddlemodels.bj.bcebos.com/video_grounding/TALL_final.pdparams) and pass its path via `--resume` for finetuning and further development.
- The model has no validation set, so valid\_interval is set to 10000 and no validation is performed during training.

**Training strategy:**

*  Adam optimizer
*  Learning rate of 1e-3

## Evaluation

The model can be evaluated in either of the following two ways:

    python eval.py --model_name=TALL \
                   --config=./configs/tall.yaml \
                   --log_interval=1 \
                   --weights=$PATH_TO_WEIGHTS \
                   --use_gpu=True

    bash run.sh eval TALL ./configs/tall.yaml

- When evaluating with `run.sh`, modify the `weights` parameter in the script to point to the weights to be evaluated.
- If `--weights` is not specified, the script downloads the released [model](https://paddlemodels.bj.bcebos.com/video_grounding/TALL_final.pdparams) and evaluates it.
- Running the above prints the test results and also saves them in a json file, by default under the data/evaluate\_results directory.
- When evaluating on CPU, set `use_gpu` to False in the command line above or in the run.sh script.

Evaluation accuracy on the TACoS dataset:

| R1@IOU5 | R5@IOU5 |
| :----: | :----: |
|  0.13  |  0.24  |

## Inference

Inference can be started in either of the following two ways:

    python predict.py --model_name=TALL \
                      --config=./configs/tall.yaml \
                      --log_interval=1 \
                      --weights=$PATH_TO_WEIGHTS \
                      --filelist=$FILELIST \
                      --use_gpu=True

    bash run.sh predict TALL ./configs/tall.yaml

- When launching from the python command line, `--filelist` specifies the list of files to infer on; users may also follow step 2 of the [data notes](../../data/dataset/tall/README.md) to generate the default inference file. `--weights` is the path to trained weights; if unset, the program automatically downloads the released weights.
- When inferring with `run.sh`, modify the `weights` parameter in the script to point to the weights to use.
- If `--weights` is not specified, the script downloads the released [model](https://paddlemodels.bj.bcebos.com/video_grounding/TALL_final.pdparams) and runs inference with it.
- Inference results are stored in a json file, by default under the `data/dataset/predict_results` directory.
- When inferring on CPU, set `use_gpu` to False in the command line or the run.sh script.

## Reference

- [TALL: Temporal Activity Localization via Language Query](https://arxiv.org/abs/1705.02101)
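As a sketch of the CTRL cross-modal fusion described above — following the paper's formulation (elementwise add, elementwise multiply, and concatenation of the two modality embeddings, followed by a fully connected layer), not necessarily the exact code in `tall_net.py`, with all sizes illustrative:

    import numpy as np

    def ctrl_fusion(visual_emb, sent_emb, w, b):
        # Paper-style CTRL fusion: the two embeddings are assumed already
        # projected to a common semantic size; add, multiply and concatenate
        # them, then map the stacked features through an FC layer with ReLU.
        add = visual_emb + sent_emb
        mul = visual_emb * sent_emb
        cat = np.concatenate([visual_emb, sent_emb], axis=-1)
        fused = np.concatenate([add, mul, cat], axis=-1)  # [batch, 4 * semantic]
        return np.maximum(fused @ w + b, 0.0)

    semantic, hidden = 1024, 1000
    v = np.random.rand(8, semantic)
    s = np.random.rand(8, semantic)
    w = np.random.rand(4 * semantic, hidden) * 0.01
    print(ctrl_fusion(v, s, w, np.zeros(hidden)).shape)   # (8, 1000)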
PaddleCV/PaddleVideo/models/tall/__init__.py
0 → 100644
from .tall import *
PaddleCV/PaddleVideo/models/tall/tall.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import
paddle
import
paddle.fluid
as
fluid
from
paddle.fluid
import
ParamAttr
import
numpy
as
np
from
..model
import
ModelBase
from
.
import
tall_net
import
logging
logger
=
logging
.
getLogger
(
__name__
)
__all__
=
[
"TALL"
]
class
TALL
(
ModelBase
):
"""TALL model"""
def
__init__
(
self
,
name
,
cfg
,
mode
=
'train'
):
super
(
TALL
,
self
).
__init__
(
name
,
cfg
,
mode
=
mode
)
self
.
get_config
()
def
get_config
(
self
):
self
.
visual_feature_dim
=
self
.
get_config_from_sec
(
'MODEL'
,
'visual_feature_dim'
)
self
.
sentence_embedding_size
=
self
.
get_config_from_sec
(
'MODEL'
,
'sentence_embedding_size'
)
self
.
semantic_size
=
self
.
get_config_from_sec
(
'MODEL'
,
'semantic_size'
)
self
.
hidden_size
=
self
.
get_config_from_sec
(
'MODEL'
,
'hidden_size'
)
self
.
output_size
=
self
.
get_config_from_sec
(
'MODEL'
,
'output_size'
)
self
.
batch_size
=
self
.
get_config_from_sec
(
self
.
mode
,
'batch_size'
)
self
.
off_size
=
self
.
get_config_from_sec
(
'train'
,
'off_size'
)
# in train of yaml
self
.
clip_norm
=
self
.
get_config_from_sec
(
'train'
,
'clip_norm'
)
self
.
learning_rate
=
self
.
get_config_from_sec
(
'train'
,
'learning_rate'
)
def
build_input
(
self
,
use_dataloader
=
True
):
visual_shape
=
self
.
visual_feature_dim
sentence_shape
=
self
.
sentence_embedding_size
offset_shape
=
self
.
off_size
# set init data to None
images
=
None
sentences
=
None
offsets
=
None
self
.
use_dataloader
=
use_dataloader
images
=
fluid
.
data
(
name
=
'train_visual'
,
shape
=
[
None
,
visual_shape
],
dtype
=
'float32'
)
sentences
=
fluid
.
data
(
name
=
'train_sentences'
,
shape
=
[
None
,
sentence_shape
],
dtype
=
'float32'
)
feed_list
=
[]
feed_list
.
append
(
images
)
feed_list
.
append
(
sentences
)
if
(
self
.
mode
==
'train'
)
or
(
self
.
mode
==
'valid'
):
offsets
=
fluid
.
data
(
name
=
'train_offsets'
,
shape
=
[
None
,
offset_shape
],
dtype
=
'float32'
)
feed_list
.
append
(
offsets
)
elif
(
self
.
mode
==
'test'
)
or
(
self
.
mode
==
'infer'
):
# input images and sentences
pass
else
:
raise
NotImplementedError
(
'mode {} not implemented'
.
format
(
self
.
mode
))
if
use_dataloader
:
assert
self
.
mode
!=
'infer'
,
\
'dataloader is not recommendated when infer, please set use_dataloader to be false.'
self
.
dataloader
=
fluid
.
io
.
DataLoader
.
from_generator
(
feed_list
=
feed_list
,
capacity
=
16
,
iterable
=
True
)
self
.
images
=
[
images
]
self
.
sentences
=
sentences
self
.
offsets
=
offsets
def
create_model_args
(
self
):
cfg
=
{}
cfg
[
'semantic_size'
]
=
self
.
semantic_size
cfg
[
'sentence_embedding_size'
]
=
self
.
sentence_embedding_size
cfg
[
'hidden_size'
]
=
self
.
hidden_size
cfg
[
'output_size'
]
=
self
.
output_size
cfg
[
'batch_size'
]
=
self
.
batch_size
return
cfg
def
build_model
(
self
):
cfg
=
self
.
create_model_args
()
self
.
videomodel
=
tall_net
.
TALLNET
(
semantic_size
=
cfg
[
'semantic_size'
],
sentence_embedding_size
=
cfg
[
'sentence_embedding_size'
],
hidden_size
=
cfg
[
'hidden_size'
],
output_size
=
cfg
[
'output_size'
],
batch_size
=
cfg
[
'batch_size'
],
mode
=
self
.
mode
)
outs
=
self
.
videomodel
.
net
(
images
=
self
.
images
[
0
],
sentences
=
self
.
sentences
)
self
.
network_outputs
=
[
outs
]
def
optimizer
(
self
):
clip_norm
=
self
.
clip_norm
fluid
.
clip
.
set_gradient_clip
(
clip
=
fluid
.
clip
.
GradientClipByGlobalNorm
(
clip_norm
=
clip_norm
))
optimizer
=
fluid
.
optimizer
.
Adam
(
learning_rate
=
self
.
learning_rate
)
return
optimizer
def
loss
(
self
):
assert
self
.
mode
!=
'infer'
,
"invalid loss calculationg in infer mode"
self
.
loss_
=
self
.
videomodel
.
loss
(
self
.
network_outputs
[
0
],
self
.
offsets
)
return
self
.
loss_
def
outputs
(
self
):
preds
=
self
.
network_outputs
[
0
]
return
[
preds
]
def
feeds
(
self
):
if
(
self
.
mode
==
'train'
)
or
(
self
.
mode
==
'valid'
):
return
self
.
images
+
[
self
.
sentences
,
self
.
offsets
]
elif
self
.
mode
==
'test'
or
(
self
.
mode
==
'infer'
):
return
self
.
images
+
[
self
.
sentences
]
else
:
raise
NotImplementedError
(
'mode {} not implemented'
.
format
(
self
.
mode
))
def
fetches
(
self
):
if
(
self
.
mode
==
'train'
)
or
(
self
.
mode
==
'valid'
):
losses
=
self
.
loss
()
fetch_list
=
[
item
for
item
in
losses
]
elif
(
self
.
mode
==
'test'
)
or
(
self
.
mode
==
'infer'
):
preds
=
self
.
outputs
()
fetch_list
=
[
item
for
item
in
preds
]
else
:
raise
NotImplementedError
(
'mode {} not implemented'
.
format
(
self
.
mode
))
return
fetch_list
def
pretrain_info
(
self
):
return
(
None
,
None
)
def
weights_info
(
self
):
return
(
'TALL_final.pdparams'
,
'https://paddlemodels.bj.bcebos.com/video_grounding/TALL_final.pdparams'
)
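For reference, here is a minimal sketch of the config sections that `get_config` above reads. The key names come directly from the `get_config_from_sec` calls in the code; the values are illustrative assumptions, not a copy of the shipped `configs/tall.yaml`:

```python
# Illustrative values only -- the real settings live in configs/tall.yaml.
tall_cfg = {
    'MODEL': {
        'visual_feature_dim': 4096,       # size of the precomputed clip feature (assumption)
        'sentence_embedding_size': 4800,  # size of the sentence vector (assumption)
        'semantic_size': 1024,            # shared cross-modal embedding size (assumption)
        'hidden_size': 1000,              # 1x1-conv channels on the fused map (assumption)
        'output_size': 3,                 # alignment score + 2 offset regressions
    },
    'TRAIN': {
        'batch_size': 56,                 # assumption
        'off_size': 2,                    # (start_offset, end_offset)
        'clip_norm': 5.0,                 # global-norm gradient clip threshold (assumption)
        'learning_rate': 1e-3,            # matches the README above
    },
}
```

`output_size: 3` follows from the network below, whose score map is unpacked in `loss` as one alignment channel plus two regression channels; `off_size: 2` matches the two regression targets the reader produces.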
PaddleCV/PaddleVideo/models/tall/tall_net.py (new file, mode 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import paddle.fluid as fluid
from paddle.fluid import ParamAttr
import numpy as np


class TALLNET(object):
    def __init__(self,
                 semantic_size,
                 sentence_embedding_size,
                 hidden_size,
                 output_size,
                 batch_size,
                 mode='train'):
        self.semantic_size = semantic_size
        self.sentence_embedding_size = sentence_embedding_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.mode = mode  # divide train and test

    def cross_modal_comb(self, visual_feat, sentence_embed):
        visual_feat = fluid.layers.reshape(visual_feat,
                                           [1, -1, self.semantic_size])
        vv_feature = fluid.layers.expand(visual_feat, [self.batch_size, 1, 1])
        sentence_embed = fluid.layers.reshape(sentence_embed,
                                              [-1, 1, self.semantic_size])
        ss_feature = fluid.layers.expand(sentence_embed,
                                         [1, self.batch_size, 1])

        concat_feature = fluid.layers.concat(
            [vv_feature, ss_feature], axis=2)  # B,B,2048
        mul_feature = vv_feature * ss_feature  # B,B,1024
        add_feature = vv_feature + ss_feature  # B,B,1024

        comb_feature = fluid.layers.concat(
            [mul_feature, add_feature, concat_feature], axis=2)
        return comb_feature

    def net(self, images, sentences):
        # visual2semantic
        transformed_clip = fluid.layers.fc(
            input=images,
            size=self.semantic_size,
            act=None,
            name='v2s_lt',
            param_attr=fluid.ParamAttr(
                name='v2s_lt_weights',
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=1.0, seed=0)),
            bias_attr=False)
        # l2_normalize
        transformed_clip = fluid.layers.l2_normalize(
            x=transformed_clip, axis=1)
        # sentence2semantic
        transformed_sentence = fluid.layers.fc(
            input=sentences,
            size=self.semantic_size,
            act=None,
            name='s2s_lt',
            param_attr=fluid.ParamAttr(
                name='s2s_lt_weights',
                initializer=fluid.initializer.NormalInitializer(
                    loc=0.0, scale=1.0, seed=0)),
            bias_attr=False)
        # l2_normalize
        transformed_sentence = fluid.layers.l2_normalize(
            x=transformed_sentence, axis=1)

        cross_modal_vec = self.cross_modal_comb(transformed_clip,
                                                transformed_sentence)
        cross_modal_vec = fluid.layers.unsqueeze(
            input=cross_modal_vec, axes=[0])
        cross_modal_vec = fluid.layers.transpose(
            cross_modal_vec, perm=[0, 3, 1, 2])

        mid_output = fluid.layers.conv2d(
            input=cross_modal_vec,
            num_filters=self.hidden_size,
            filter_size=1,
            stride=1,
            act="relu",
            param_attr=fluid.param_attr.ParamAttr(name="mid_out_weights"),
            bias_attr=False)
        sim_score_mat = fluid.layers.conv2d(
            input=mid_output,
            num_filters=self.output_size,
            filter_size=1,
            stride=1,
            act=None,
            param_attr=fluid.param_attr.ParamAttr(name="sim_mat_weights"),
            bias_attr=False)
        sim_score_mat = fluid.layers.squeeze(input=sim_score_mat, axes=[0])
        return sim_score_mat

    def loss(self, outs, offs):
        sim_score_mat = outs[0]
        p_reg_mat = outs[1]
        l_reg_mat = outs[2]
        # loss cls, not considering iou
        input_size = outs.shape[1]
        I = fluid.layers.diag(np.array([1] * input_size).astype('float32'))
        I_2 = -2 * I
        all1 = fluid.layers.ones(
            shape=[input_size, input_size], dtype="float32")

        mask_mat = I_2 + all1
        alpha = 1.0 / input_size
        lambda_regression = 0.01
        batch_para_mat = alpha * all1
        para_mat = I + batch_para_mat

        sim_mask_mat = fluid.layers.exp(mask_mat * sim_score_mat)
        loss_mat = fluid.layers.log(all1 + sim_mask_mat)
        loss_mat = loss_mat * para_mat
        loss_align = fluid.layers.mean(loss_mat)

        # regression loss
        reg_ones = fluid.layers.ones(shape=[input_size, 1], dtype="float32")
        l_reg_diag = fluid.layers.matmul(
            l_reg_mat * I, reg_ones, transpose_x=True, transpose_y=False)
        p_reg_diag = fluid.layers.matmul(
            p_reg_mat * I, reg_ones, transpose_x=True, transpose_y=False)
        offset_pred = fluid.layers.concat(
            input=[p_reg_diag, l_reg_diag], axis=1)
        loss_reg = fluid.layers.mean(
            fluid.layers.abs(offset_pred - offs))  # L1 loss
        loss = lambda_regression * loss_reg + loss_align
        avg_loss = fluid.layers.mean(loss)

        return [avg_loss]
```
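To make the shapes in `cross_modal_comb` concrete, here is a plain-numpy sketch of the same fusion; the names and the batch size are illustrative, and the Paddle code above remains the actual implementation:

```python
import numpy as np

B, S = 4, 1024  # batch size and semantic_size (illustrative values)
visual = np.random.rand(B, S).astype('float32')    # transformed_clip
sentence = np.random.rand(B, S).astype('float32')  # transformed_sentence

vv = np.tile(visual[np.newaxis, :, :], (B, 1, 1))    # (B, B, S): every row repeats all clips
ss = np.tile(sentence[:, np.newaxis, :], (1, B, 1))  # (B, B, S): row i repeats sentence i

# every sentence paired with every clip: multiplicative, additive and concat fusion
comb = np.concatenate([vv * ss, vv + ss, np.concatenate([vv, ss], axis=2)], axis=2)
print(comb.shape)  # (B, B, 4 * S) -> here (4, 4, 4096)
```

In `loss`, the diagonal of `sim_score_mat` holds the scores of matched clip-sentence pairs. The mask `I_2 + all1` is -1 on the diagonal and +1 elsewhere, so minimizing `log(1 + exp(mask * score))` pushes matched scores up and mismatched scores down, while the diagonal offset regressions are trained with an L1 penalty weighted by `lambda_regression = 0.01`.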
PaddleCV/PaddleVideo/models/tsm/tsm.py

```diff
@@ -51,25 +51,23 @@ class TSM(ModelBase):
         self.target_size = self.get_config_from_sec(self.mode, 'target_size')
         self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')
 
-    def build_input(self, use_pyreader=True):
+    def build_input(self, use_dataloader=True):
         image_shape = [3, self.target_size, self.target_size]
         image_shape[0] = image_shape[0] * self.seglen
-        image_shape = [self.seg_num] + image_shape
-        self.use_pyreader = use_pyreader
+        image_shape = [None, self.seg_num] + image_shape
+        self.use_dataloader = use_dataloader
 
-        image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+        image = fluid.data(name='image', shape=image_shape, dtype='float32')
         if self.mode != 'infer':
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
         else:
             label = None
 
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            py_reader = fluid.io.PyReader(
-                feed_list=[image, label], capacity=4, iterable=True)
-            self.py_reader = py_reader
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
+                feed_list=[image, label], capacity=4, iterable=True)
 
         self.feature_input = [image]
         self.label_input = label
```
PaddleCV/PaddleVideo/models/tsn/README.md

```diff
@@ -15,7 +15,7 @@
 Temporal Segment Network (TSN) is a classic 2D-CNN-based solution in the video classification field. It mainly addresses the recognition of long-range activities in videos: by sparsely sampling video frames instead of dense sampling, it captures global video information while removing redundancy and reducing computation. The per-frame features are finally averaged into a video-level feature used for classification. The model implemented here is the single-stream RGB TSN with a ResNet-50 backbone.
 
-For details, please refer to the ECCV 2016 paper [StNet:Local and Global Spatial-Temporal Modeling for Human Action Recognition](https://arxiv.org/abs/1608.00859)
+For details, please refer to the ECCV 2016 paper [Temporal Segment Networks: Towards Good Practices for Deep Action Recognition](https://arxiv.org/abs/1608.00859)
 
 ## Data Preparation
```
PaddleCV/PaddleVideo/models/tsn/tsn.py

```diff
@@ -52,25 +52,23 @@ class TSN(ModelBase):
         self.target_size = self.get_config_from_sec(self.mode, 'target_size')
         self.batch_size = self.get_config_from_sec(self.mode, 'batch_size')
 
-    def build_input(self, use_pyreader=True):
+    def build_input(self, use_dataloader=True):
         image_shape = [3, self.target_size, self.target_size]
         image_shape[0] = image_shape[0] * self.seglen
-        image_shape = [self.seg_num] + image_shape
-        self.use_pyreader = use_pyreader
+        image_shape = [None, self.seg_num] + image_shape
+        self.use_dataloader = use_dataloader
 
-        image = fluid.layers.data(name='image', shape=image_shape, dtype='float32')
+        image = fluid.data(name='image', shape=image_shape, dtype='float32')
         if self.mode != 'infer':
-            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+            label = fluid.data(name='label', shape=[None, 1], dtype='int64')
         else:
             label = None
 
-        if use_pyreader:
+        if use_dataloader:
             assert self.mode != 'infer', \
-                'pyreader is not recommendated when infer, please set use_pyreader to be false.'
-            py_reader = fluid.io.PyReader(
-                feed_list=[image, label], capacity=4, iterable=True)
-            self.py_reader = py_reader
+                'dataloader is not recommendated when infer, please set use_dataloader to be false.'
+            self.dataloader = fluid.io.DataLoader.from_generator(
+                feed_list=[image, label], capacity=4, iterable=True)
 
         self.feature_input = [image]
         self.label_input = label
```
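The TSM/TSN hunks above are a mechanical migration from the deprecated `fluid.io.PyReader` to `fluid.io.DataLoader`. A minimal standalone sketch of the new pattern, assuming the Paddle 1.6-era fluid APIs; the shapes and dummy data are illustrative:

```python
import numpy as np
import paddle.fluid as fluid

# seg_num=8, seglen=1, target_size=224 would give this input shape in TSN/TSM
image = fluid.data(name='image', shape=[None, 8, 3, 224, 224], dtype='float32')
label = fluid.data(name='label', shape=[None, 1], dtype='int64')
loss = fluid.layers.reduce_mean(fluid.layers.cast(label, 'float32'))

# iterable DataLoader replaces PyReader; it is fed from a sample-list generator
loader = fluid.io.DataLoader.from_generator(
    feed_list=[image, label], capacity=4, iterable=True)

def sample_list_reader():
    for _ in range(16):  # dummy data for illustration
        yield [(np.zeros([8, 3, 224, 224], 'float32'), np.array([0], 'int64'))]

places = [fluid.CPUPlace()]  # or fluid.cuda_places() on GPU
loader.set_sample_list_generator(sample_list_reader, places=places)

exe = fluid.Executor(places[0])
exe.run(fluid.default_startup_program())
for data in loader():  # each `data` already carries the per-place feed
    out, = exe.run(feed=data, fetch_list=[loss])
```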
PaddleCV/PaddleVideo/predict.py

```diff
@@ -30,6 +30,7 @@ import models
 from reader import get_reader
 from metrics import get_metrics
 from utils.utility import check_cuda
+from utils.utility import check_version
 
 logging.root.handlers = []
 FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
@@ -100,7 +101,7 @@ def infer(args):
     infer_config = merge_configs(config, 'infer', vars(args))
     print_configs(infer_config, "Infer")
     infer_model = models.get_model(args.model_name, infer_config, mode='infer')
-    infer_model.build_input(use_pyreader=False)
+    infer_model.build_input(use_dataloader=False)
     infer_model.build_model()
     infer_feeds = infer_model.feeds()
     infer_outputs = infer_model.outputs()
@@ -108,6 +109,8 @@ def infer(args):
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
 
     filelist = args.filelist or infer_config.INFER.filelist
+    filepath = args.video_path or infer_config.INFER.get('filepath', '')
+    if filepath != '':
@@ -136,16 +139,42 @@ def infer(args):
     periods = []
     cur_time = time.time()
     for infer_iter, data in enumerate(infer_reader()):
-        data_feed_in = [items[:-1] for items in data]
-        video_id = [items[-1] for items in data]
-        infer_outs = exe.run(fetch_list=fetch_list,
-                             feed=infer_feeder.feed(data_feed_in))
+        if args.model_name == 'ETS':
+            data_feed_in = [items[:3] for items in data]
+            vinfo = [items[3:] for items in data]
+            video_id = [items[0] for items in vinfo]
+            infer_outs = exe.run(fetch_list=fetch_list,
+                                 feed=infer_feeder.feed(data_feed_in),
+                                 return_numpy=False)
+            infer_result_list = infer_outs + [vinfo]
+        elif args.model_name == 'TALL':
+            data_feed_in = [items[:2] for items in data]
+            vinfo = [items[2:] for items in data]
+            video_id = [items[6] for items in vinfo]
+            infer_outs = exe.run(fetch_list=fetch_list,
+                                 feed=infer_feeder.feed(data_feed_in),
+                                 return_numpy=True)
+            infer_result_list = infer_outs + [vinfo]
+        elif args.model_name == 'BsnPem':
+            data_feed_in = [items[:1] for items in data]
+            vinfo = [items[1:] for items in data]
+            video_id = [items[2] for items in data]
+            infer_outs = exe.run(fetch_list=fetch_list,
+                                 feed=infer_feeder.feed(data_feed_in),
+                                 return_numpy=False)
+            infer_result_list = infer_outs + [vinfo]
+        else:
+            data_feed_in = [items[:-1] for items in data]
+            video_id = [items[-1] for items in data]
+            infer_outs = exe.run(fetch_list=fetch_list,
+                                 feed=infer_feeder.feed(data_feed_in))
+            infer_result_list = [item for item in infer_outs] + [video_id]
 
         prev_time = cur_time
         cur_time = time.time()
         period = cur_time - prev_time
         periods.append(period)
 
-        infer_result_list = [item for item in infer_outs] + [video_id]
         infer_metrics.accumulate(infer_result_list)
 
         if args.log_interval > 0 and infer_iter % args.log_interval == 0:
@@ -165,6 +194,7 @@ if __name__ == "__main__":
     args = parse_args()
     # check whether the installed paddle is compiled with GPU
     check_cuda(args.use_gpu)
+    check_version()
     logger.info(args)
     infer(args)
```
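The per-model slicing above mirrors the tuple layout each reader yields. For TALL, for example, `make_infer_reader` in `tall_reader.py` (shown further below) emits 9-tuples in which only the first two entries are network inputs; a sketch of that layout with illustrative names:

```python
# One TALL inference sample as yielded by the reader (see tall_reader.py below):
#   (featmap, sent_vec, start, end, k, t, movie_clip_sentences, movie_clip_featmaps, movie_name)
sample = featmap, sent_vec, start, end, k, t, sentences, featmaps, movie_name  # illustrative
feed_part = sample[:2]  # featmap + sent_vec are fed to the network
vinfo = sample[2:]      # the rest is bookkeeping for the metrics module
video_id = vinfo[6]     # == movie_name, which is why predict.py indexes items[6]
```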
PaddleCV/PaddleVideo/reader/__init__.py

```diff
@@ -6,6 +6,8 @@ from .ctcn_reader import CTCNReader
 from .bmn_reader import BMNReader
 from .bsn_reader import BSNVideoReader
 from .bsn_reader import BSNProposalReader
+from .ets_reader import ETSReader
+from .tall_reader import TALLReader
 
 # regist reader, sort by alphabet
 regist_reader("ATTENTIONCLUSTER", FeatureReader)
@@ -19,3 +21,5 @@ regist_reader("CTCN", CTCNReader)
 regist_reader("BMN", BMNReader)
 regist_reader("BSNTEM", BSNVideoReader)
 regist_reader("BSNPEM", BSNProposalReader)
+regist_reader("ETS", ETSReader)
+regist_reader("TALL", TALLReader)
```
PaddleCV/PaddleVideo/reader/bmn_reader.py

```diff
@@ -13,11 +13,15 @@
 #limitations under the License.
 
 import os
+import platform
 import random
 import numpy as np
 import multiprocessing
+import json
 import logging
+import functools
+import paddle
 
 logger = logging.getLogger(__name__)
 
 from .reader_utils import DataReader
@@ -150,7 +154,11 @@ class BMNReader(DataReader):
         if self.num_threads == 1:
             return self.make_reader()
         else:
-            return self.make_multiprocess_reader()
+            sysstr = platform.system()
+            if sysstr == 'Windows':
+                return self.make_multithread_reader()
+            else:
+                return self.make_multiprocess_reader()
 
     def make_infer_reader(self):
         """reader for inference"""
@@ -196,6 +204,41 @@ class BMNReader(DataReader):
         return reader
 
+    def make_multithread_reader(self):
+        def reader():
+            if self.mode == 'train':
+                random.shuffle(self.video_list)
+            for video_name in self.video_list:
+                video_idx = self.video_list.index(video_name)
+                yield [video_name, video_idx]
+
+        def process_data(sample, mode):
+            video_name = sample[0]
+            video_idx = sample[1]
+            video_feat = self.load_file(video_name)
+            gt_iou_map, gt_start, gt_end = self.get_video_label(video_name)
+            if mode == 'train' or mode == 'valid':
+                return (video_feat, gt_iou_map, gt_start, gt_end)
+            elif mode == 'test':
+                return (video_feat, gt_iou_map, gt_start, gt_end, video_idx)
+            else:
+                raise NotImplementedError('mode {} not implemented'.format(mode))
+
+        mapper = functools.partial(process_data, mode=self.mode)
+
+        def batch_reader():
+            xreader = paddle.reader.xmap_readers(mapper, reader,
+                                                 self.num_threads, 1024)
+            batch = []
+            for item in xreader():
+                batch.append(item)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+        return batch_reader
 
     def make_multiprocess_reader(self):
         """multiprocess reader"""
```
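`paddle.reader.xmap_readers`, used by the new Windows-friendly multithread readers above, maps a function over a reader with a pool of worker threads. A minimal self-contained sketch of the same pattern (names are illustrative):

```python
import paddle

def keys():  # lightweight reader: yields work items
    for name in ['a', 'b', 'c']:
        yield name

def load(name):  # heavy per-sample work done in worker threads
    return name.upper()

# 4 threads, buffer of up to 1024 in-flight samples; output order is not preserved
xreader = paddle.reader.xmap_readers(load, keys, 4, 1024)
print(sorted(xreader()))  # ['A', 'B', 'C']
```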
PaddleCV/PaddleVideo/reader/bsn_reader.py

```diff
@@ -13,12 +13,15 @@
 #limitations under the License.
 
 import os
+import platform
 import random
 import numpy as np
 import pandas as pd
 import multiprocessing
 import json
 import logging
+import functools
+import paddle
 
 logger = logging.getLogger(__name__)
 
 from .reader_utils import DataReader
@@ -136,7 +139,11 @@ class BSNVideoReader(DataReader):
         if self.num_threads == 1:
             return self.make_reader()
         else:
-            return self.make_multiprocess_reader()
+            sysstr = platform.system()
+            if sysstr == 'Windows':
+                return self.make_multithread_reader()
+            else:
+                return self.make_multiprocess_reader()
 
     def make_infer_reader(self):
         """reader for inference"""
@@ -182,6 +189,42 @@ class BSNVideoReader(DataReader):
         return reader
 
+    def make_multithread_reader(self):
+        def reader():
+            if self.mode == 'train':
+                random.shuffle(self.video_list)
+            for video_name in self.video_list:
+                video_idx = self.video_list.index(video_name)
+                yield [video_name, video_idx]
+
+        def process_data(sample, mode):
+            video_name = sample[0]
+            video_idx = sample[1]
+            video_feat = self.load_file(video_name)
+            gt_start, gt_end, gt_action = self.get_video_label(video_name)
+            if mode == 'train' or mode == 'valid':
+                return (video_feat, gt_start, gt_end, gt_action)
+            elif mode == 'test':
+                return (video_feat, gt_start, gt_end, gt_action, video_idx)
+            else:
+                raise NotImplementedError('mode {} not implemented'.format(
+                    self.mode))
+
+        mapper = functools.partial(process_data, mode=self.mode)
+
+        def batch_reader():
+            xreader = paddle.reader.xmap_readers(mapper, reader,
+                                                 self.num_threads, 1024)
+            batch = []
+            for item in xreader():
+                batch.append(item)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+        return batch_reader
 
     def make_multiprocess_reader(self):
         """multiprocess reader"""
@@ -304,8 +347,10 @@ class BSNProposalReader(DataReader):
         props_end = pdf.xmax.values[:]
         props_start_score = pdf.xmin_score.values[:]
         props_end_score = pdf.xmax_score.values[:]
-        props_info = np.stack(
-            [props_start, props_end, props_start_score, props_end_score])
+        props_info = np.hstack([
+            props_start[:, np.newaxis], props_end[:, np.newaxis],
+            props_start_score[:, np.newaxis], props_end_score[:, np.newaxis]
+        ])
         if self.mode == "infer":
             return props_info
         else:
@@ -325,7 +370,11 @@ class BSNProposalReader(DataReader):
         if self.num_threads == 1:
             return self.make_reader()
         else:
-            return self.make_multiprocess_reader()
+            sysstr = platform.system()
+            if sysstr == 'Windows':
+                return self.make_multithread_reader()
+            else:
+                return self.make_multiprocess_reader()
 
     def make_infer_reader(self):
         """reader for inference"""
@@ -371,6 +420,41 @@ class BSNProposalReader(DataReader):
         return reader
 
+    def make_multithread_reader(self):
+        def reader():
+            if self.mode == 'train':
+                random.shuffle(self.video_list)
+            for video_name in self.video_list:
+                video_idx = self.video_list.index(video_name)
+                yield [video_name, video_idx]
+
+        def process_data(sample, mode):
+            video_name = sample[0]
+            video_idx = sample[1]
+            props_feat = self.load_file(video_name)
+            props_iou, props_info = self.get_props(video_name)
+            if mode == 'train' or mode == 'valid':
+                return (props_feat, props_iou)
+            elif mode == 'test':
+                return (props_feat, props_iou, props_info, video_idx)
+            else:
+                raise NotImplementedError('mode {} not implemented'.format(mode))
+
+        mapper = functools.partial(process_data, mode=self.mode)
+
+        def batch_reader():
+            xreader = paddle.reader.xmap_readers(mapper, reader,
+                                                 self.num_threads, 1024)
+            batch = []
+            for item in xreader():
+                batch.append(item)
+                if len(batch) == self.batch_size:
+                    yield batch
+                    batch = []
+
+        return batch_reader
 
     def make_multiprocess_reader(self):
        """multiprocess reader"""
```
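The `np.stack` to `np.hstack` change in `BSNProposalReader.get_props` transposes the proposal info from shape `(4, N)` to `(N, 4)`, one row per proposal. A quick numpy illustration with made-up values:

```python
import numpy as np

xmin = np.array([0.1, 0.4])
xmax = np.array([0.3, 0.9])
xmin_score = np.array([0.8, 0.6])
xmax_score = np.array([0.7, 0.5])

old = np.stack([xmin, xmax, xmin_score, xmax_score])  # shape (4, 2): one row per field
new = np.hstack([xmin[:, np.newaxis], xmax[:, np.newaxis],
                 xmin_score[:, np.newaxis], xmax_score[:, np.newaxis]])
print(new.shape)  # (2, 4): one row per proposal, [xmin, xmax, xmin_score, xmax_score]
```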
PaddleCV/PaddleVideo/reader/ets_reader.py (new file, mode 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import os
import random
import sys
import numpy as np
import functools
import paddle

import logging
logger = logging.getLogger(__name__)
import pickle

from .reader_utils import DataReader

python_ver = sys.version_info


class ETSReader(DataReader):
    """
    Data reader for ETS model, which was stored as features extracted by prior networks
    """

    def __init__(self, name, mode, cfg):
        self.name = name
        self.mode = mode

        self.feat_path = cfg.MODEL.feat_path
        self.dict_file = cfg.MODEL.dict_file
        self.START = cfg.MODEL.START
        self.END = cfg.MODEL.END
        self.UNK = cfg.MODEL.UNK

        self.filelist = cfg[mode.upper()]['filelist']
        self.batch_size = cfg[mode.upper()]['batch_size']
        self.num_threads = cfg[mode.upper()]['num_threads']
        self.buffer_size = cfg[mode.upper()]['buffer_size']
        if (mode == 'test') or (mode == 'infer'):
            self.num_threads = 1  # set num_threads as 1 for test and infer

    def load_file(self):
        word_dict = dict()
        with open(self.dict_file, 'r') as f:
            for i, line in enumerate(f):
                word_dict[line.strip().split()[0]] = i
        return word_dict

    def create_reader(self):
        """reader creator for ets model"""
        if self.mode == 'infer':
            return self.make_infer_reader()
        else:
            return self.make_multiprocess_reader()

    def make_infer_reader(self):
        """reader for inference"""

        def reader():
            batch_out = []
            with open(self.filelist) as f:
                lines = f.readlines()
                reader_list = [
                    line.strip() for line in lines if line.strip() != ''
                ]
            word_dict = self.load_file()
            for line in reader_list:
                vid, stime, etime, sentence = line.split('\t')
                stime, etime = float(stime), float(etime)
                if python_ver < (3, 0):
                    datas = pickle.load(
                        open(os.path.join(self.feat_path, vid), 'rb'))
                else:
                    datas = pickle.load(
                        open(os.path.join(self.feat_path, vid), 'rb'),
                        encoding='bytes')
                feat = datas[int(stime * 5):int(etime * 5 + 0.5), :]
                init_ids = np.array([[0]], dtype='int64')
                init_scores = np.array([[0.]], dtype='float32')
                if feat.shape[0] == 0:
                    continue
                batch_out.append((feat, init_ids, init_scores, vid, stime,
                                  etime))
                if len(batch_out) == self.batch_size:
                    yield batch_out
                    batch_out = []

        return reader

    def make_multiprocess_reader(self):
        """multiprocess reader"""

        def process_data(sample):
            vid, feat, stime, etime, sentence = sample
            if self.mode == 'train' or self.mode == 'valid':
                word_ids = [
                    word_dict.get(w, word_dict[self.UNK])
                    for w in sentence.split()
                ]
                word_ids_next = word_ids + [word_dict[self.END]]
                word_ids = [word_dict[self.START]] + word_ids
                return feat, word_ids, word_ids_next
            elif self.mode == 'test':
                init_ids = np.array([[0]], dtype='int64')
                init_scores = np.array([[0.]], dtype='float32')
                return feat, init_ids, init_scores, vid, stime, etime
            else:
                raise NotImplementedError('mode {} not implemented'.format(
                    self.mode))

        def make_reader():
            def reader():
                lines = open(self.filelist).readlines()
                reader_list = [
                    line.strip() for line in lines if line.strip() != ''
                ]
                if self.mode == 'train':
                    random.shuffle(reader_list)
                for line in reader_list:
                    vid, stime, etime, sentence = line.split('\t')
                    stime, etime = float(stime), float(etime)
                    if python_ver < (3, 0):
                        datas = pickle.load(
                            open(os.path.join(self.feat_path, vid), 'rb'))
                    else:
                        datas = pickle.load(
                            open(os.path.join(self.feat_path, vid), 'rb'),
                            encoding='bytes')
                    feat = datas[int(stime * 5):int(etime * 5 + 0.5), :]
                    if feat.shape[0] == 0:
                        continue
                    yield [vid, feat, stime, etime, sentence]

            mapper = functools.partial(process_data)
            return paddle.reader.xmap_readers(mapper, reader,
                                              self.num_threads,
                                              self.buffer_size)

        def batch_reader():
            batch_out = []
            for out in _reader():
                batch_out.append(out)
                if len(batch_out) == self.batch_size:
                    yield batch_out
                    batch_out = []

        word_dict = self.load_file()
        _reader = make_reader()
        return batch_reader
```
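The reader above expects each non-empty line of `filelist` to be tab-separated as `vid<TAB>start_time<TAB>end_time<TAB>sentence`, where the pickle at `feat_path/<vid>` holds a `(T, D)` feature array sampled at 5 features per second, sliced by `[int(stime * 5):int(etime * 5 + 0.5)]`. An illustrative line with made-up values:

```python
# made-up video id and times; the real filelist is produced by the ETS data scripts
line = "v_QOlSCBRmfWY\t12.3\t21.8\ta man is performing a handstand"
vid, stime, etime, sentence = line.split('\t')
stime, etime = float(stime), float(etime)
print(int(stime * 5), int(etime * 5 + 0.5))  # 61 109 -> feature rows used for this segment
```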
PaddleCV/PaddleVideo/reader/kinetics_reader.py

```diff
@@ -99,6 +99,8 @@ class KineticsReader(DataReader):
                 img_mean=self.img_mean,
                 img_std=self.img_std)
         else:
+            assert os.path.exists(self.filelist), \
+                '{} not exist, please check the data list'.format(self.filelist)
             _reader = self._reader_creator(self.filelist, self.mode, seg_num=self.seg_num, seglen=self.seglen, \
                             short_size=self.short_size, target_size=self.target_size, \
                             img_mean=self.img_mean, img_std=self.img_std, \
```
PaddleCV/PaddleVideo/reader/tall_reader.py (new file, mode 100644)

```python
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import os
import random
import sys
import numpy as np
import h5py
import multiprocessing
import functools
import paddle

random.seed(0)

import logging
logger = logging.getLogger(__name__)

try:
    import cPickle as pickle
except:
    import pickle

from .reader_utils import DataReader

python_ver = sys.version_info


class TALLReader(DataReader):
    """
    Data reader for TALL model, which was stored as features extracted by prior networks
    """

    def __init__(self, name, mode, cfg):
        self.name = name
        self.mode = mode
        self.visual_feature_dim = cfg.MODEL.visual_feature_dim
        self.movie_length_info = cfg.TRAIN.movie_length_info

        self.feats_dimen = cfg[mode.upper()]['feats_dimen']
        self.context_num = cfg[mode.upper()]['context_num']
        self.context_size = cfg[mode.upper()]['context_size']
        self.sent_vec_dim = cfg[mode.upper()]['sent_vec_dim']
        self.sliding_clip_path = cfg[mode.upper()]['sliding_clip_path']
        self.clip_sentvec = cfg[mode.upper()]['clip_sentvec']
        self.semantic_size = cfg[mode.upper()]['semantic_size']
        self.batch_size = cfg[mode.upper()]['batch_size']
        self.init_data()

    def get_context_window(self, clip_name):
        # compute left (pre) and right (post) context features based on read_unit_level_feats().
        movie_name = clip_name.split("_")[0]
        start = int(clip_name.split("_")[1])
        end = int(clip_name.split("_")[2].split(".")[0])
        clip_length = self.context_size
        left_context_feats = np.zeros(
            [self.context_num, self.feats_dimen], dtype=np.float32)
        right_context_feats = np.zeros(
            [self.context_num, self.feats_dimen], dtype=np.float32)
        last_left_feat = np.load(
            os.path.join(self.sliding_clip_path, clip_name))
        last_right_feat = np.load(
            os.path.join(self.sliding_clip_path, clip_name))
        for k in range(self.context_num):
            left_context_start = start - clip_length * (k + 1)
            left_context_end = start - clip_length * k
            right_context_start = end + clip_length * k
            right_context_end = end + clip_length * (k + 1)
            left_context_name = movie_name + "_" + str(
                left_context_start) + "_" + str(left_context_end) + ".npy"
            right_context_name = movie_name + "_" + str(
                right_context_start) + "_" + str(right_context_end) + ".npy"
            if os.path.exists(
                    os.path.join(self.sliding_clip_path, left_context_name)):
                left_context_feat = np.load(
                    os.path.join(self.sliding_clip_path, left_context_name))
                last_left_feat = left_context_feat
            else:
                left_context_feat = last_left_feat
            if os.path.exists(
                    os.path.join(self.sliding_clip_path, right_context_name)):
                right_context_feat = np.load(
                    os.path.join(self.sliding_clip_path, right_context_name))
                last_right_feat = right_context_feat
            else:
                right_context_feat = last_right_feat
            left_context_feats[k] = left_context_feat
            right_context_feats[k] = right_context_feat
        return np.mean(
            left_context_feats, axis=0), np.mean(
                right_context_feats, axis=0)

    def init_data(self):
        def calculate_IoU(i0, i1):
            # calculate temporal intersection over union
            union = (min(i0[0], i1[0]), max(i0[1], i1[1]))
            inter = (max(i0[0], i1[0]), min(i0[1], i1[1]))
            iou = 1.0 * (inter[1] - inter[0]) / (union[1] - union[0])
            return iou

        def calculate_nIoL(base, sliding_clip):
            # calculate the non Intersection part over Length ratio, make sure the input IoU is larger than 0
            inter = (max(base[0], sliding_clip[0]),
                     min(base[1], sliding_clip[1]))
            inter_l = inter[1] - inter[0]
            length = sliding_clip[1] - sliding_clip[0]
            nIoL = 1.0 * (length - inter_l) / length
            return nIoL

        # load file
        if (self.mode == 'train') or (self.mode == 'valid'):
            if python_ver < (3, 0):
                cs = pickle.load(open(self.clip_sentvec, 'rb'))
                movie_length_info = pickle.load(
                    open(self.movie_length_info, 'rb'))
            else:
                cs = pickle.load(
                    open(self.clip_sentvec, 'rb'), encoding='bytes')
                movie_length_info = pickle.load(
                    open(self.movie_length_info, 'rb'), encoding='bytes')
        elif (self.mode == 'test') or (self.mode == 'infer'):
            if python_ver < (3, 0):
                cs = pickle.load(open(self.clip_sentvec, 'rb'))
            else:
                cs = pickle.load(
                    open(self.clip_sentvec, 'rb'), encoding='bytes')

        self.clip_sentence_pairs = []
        for l in cs:
            clip_name = l[0].decode('utf-8')  # byte object to string
            sent_vecs = l[1]  # numpy array
            for sent_vec in sent_vecs:
                self.clip_sentence_pairs.append((clip_name, sent_vec))  # 10146
        logger.info(self.mode.upper() + ':' +
                    str(len(self.clip_sentence_pairs)) +
                    " clip-sentence pairs are readed")

        movie_names_set = set()
        movie_clip_names = {}
        # read groundtruth sentence-clip pairs
        for k in range(len(self.clip_sentence_pairs)):
            clip_name = self.clip_sentence_pairs[k][0]
            movie_name = clip_name.split("_")[0]
            if not movie_name in movie_names_set:
                movie_names_set.add(movie_name)
                movie_clip_names[movie_name] = []
            movie_clip_names[movie_name].append(k)
        self.movie_names = list(movie_names_set)
        logger.info(self.mode.upper() + ':' + str(len(self.movie_names)) +
                    " movies.")

        # read sliding windows, and match them with the groundtruths to make training samples
        sliding_clips_tmp = os.listdir(self.sliding_clip_path)  # 161396
        self.clip_sentence_pairs_iou = []

        if self.mode == 'valid':
            # TALL model doesn't take validation during training, it will test after all the training epochs finish.
            return
        if self.mode == 'train':
            for clip_name in sliding_clips_tmp:
                if clip_name.split(".")[2] == "npy":
                    movie_name = clip_name.split("_")[0]
                    for clip_sentence in self.clip_sentence_pairs:
                        original_clip_name = clip_sentence[0]
                        original_movie_name = original_clip_name.split("_")[0]
                        if original_movie_name == movie_name:
                            start = int(clip_name.split("_")[1])
                            end = int(clip_name.split("_")[2].split(".")[0])
                            o_start = int(original_clip_name.split("_")[1])
                            o_end = int(
                                original_clip_name.split("_")[2].split(".")[0])
                            iou = calculate_IoU((start, end),
                                                (o_start, o_end))
                            if iou > 0.5:
                                nIoL = calculate_nIoL((o_start, o_end),
                                                      (start, end))
                                if nIoL < 0.15:
                                    movie_length = movie_length_info[
                                        movie_name.split(".")[0].encode(
                                            'utf-8')]  # str to byte
                                    start_offset = o_start - start
                                    end_offset = o_end - end
                                    self.clip_sentence_pairs_iou.append(
                                        (clip_sentence[0], clip_sentence[1],
                                         clip_name, start_offset, end_offset))
            logger.info('TRAIN:' + str(len(self.clip_sentence_pairs_iou)) +
                        " iou clip-sentence pairs are readed")
        elif (self.mode == 'test') or (self.mode == 'infer'):
            for clip_name in sliding_clips_tmp:
                if clip_name.split(".")[2] == "npy":
                    movie_name = clip_name.split("_")[0]
                    if movie_name in movie_clip_names:
                        self.clip_sentence_pairs_iou.append(
                            clip_name.split(".")[0] + "." + clip_name.split(".")
                            [1])
            logger.info('TEST:' + str(len(self.clip_sentence_pairs_iou)) +
                        " iou clip-sentence pairs are readed")

    def load_movie_slidingclip(self, clip_sentence_pairs,
                               clip_sentence_pairs_iou, movie_name):
        # load unit level feats and sentence vector
        movie_clip_sentences = []
        movie_clip_featmap = []
        for k in range(len(clip_sentence_pairs)):
            if movie_name in clip_sentence_pairs[k][0]:
                movie_clip_sentences.append(
                    (clip_sentence_pairs[k][0],
                     clip_sentence_pairs[k][1][:self.semantic_size]))
        for k in range(len(clip_sentence_pairs_iou)):
            if movie_name in clip_sentence_pairs_iou[k]:
                visual_feature_path = os.path.join(
                    self.sliding_clip_path,
                    clip_sentence_pairs_iou[k] + ".npy")
                left_context_feat, right_context_feat = self.get_context_window(
                    clip_sentence_pairs_iou[k] + ".npy")
                feature_data = np.load(visual_feature_path)
                comb_feat = np.hstack(
                    (left_context_feat, feature_data, right_context_feat))
                movie_clip_featmap.append(
                    (clip_sentence_pairs_iou[k], comb_feat))
        return movie_clip_featmap, movie_clip_sentences

    def create_reader(self):
        """reader creator for tall model"""
        if self.mode == 'infer':
            return self.make_infer_reader()
        else:
            return self.make_reader()

    def make_infer_reader(self):
        """reader for inference"""

        def reader():
            batch_out = []
            idx = 0
            for movie_name in self.movie_names:
                idx += 1
                movie_clip_featmaps, movie_clip_sentences = self.load_movie_slidingclip(
                    self.clip_sentence_pairs, self.clip_sentence_pairs_iou,
                    movie_name)
                for k in range(len(movie_clip_sentences)):
                    sent_vec = movie_clip_sentences[k][1]
                    sent_vec = np.reshape(sent_vec, [1, sent_vec.shape[0]])
                    for t in range(len(movie_clip_featmaps)):
                        featmap = movie_clip_featmaps[t][1]
                        visual_clip_name = movie_clip_featmaps[t][0]
                        start = float(visual_clip_name.split("_")[1])
                        end = float(
                            visual_clip_name.split("_")[2].split("_")[0])
                        featmap = np.reshape(featmap, [1, featmap.shape[0]])
                        batch_out.append(
                            (featmap, sent_vec, start, end, k, t,
                             movie_clip_sentences, movie_clip_featmaps,
                             movie_name))
                        if len(batch_out) == self.batch_size:
                            yield batch_out
                            batch_out = []

        return reader

    def make_reader(self):
        def reader():
            batch_out = []
            if self.mode == 'valid':
                return
            elif self.mode == 'train':
                random.shuffle(self.clip_sentence_pairs_iou)
                for clip_sentence_pair in self.clip_sentence_pairs_iou:
                    offset = np.zeros(2, dtype=np.float32)
                    clip_name = clip_sentence_pair[0]
                    feat_path = os.path.join(self.sliding_clip_path,
                                             clip_sentence_pair[2])
                    featmap = np.load(feat_path)
                    left_context_feat, right_context_feat = self.get_context_window(
                        clip_sentence_pair[2])
                    image = np.hstack(
                        (left_context_feat, featmap, right_context_feat))
                    sentence = clip_sentence_pair[1][:self.sent_vec_dim]
                    p_offset = clip_sentence_pair[3]
                    l_offset = clip_sentence_pair[4]
                    offset[0] = p_offset
                    offset[1] = l_offset
                    batch_out.append((image, sentence, offset))
                    if len(batch_out) == self.batch_size:
                        yield batch_out
                        batch_out = []
            elif self.mode == 'test':
                for movie_name in self.movie_names:
                    movie_clip_featmaps, movie_clip_sentences = self.load_movie_slidingclip(
                        self.clip_sentence_pairs,
                        self.clip_sentence_pairs_iou, movie_name)
                    for k in range(len(movie_clip_sentences)):
                        sent_vec = movie_clip_sentences[k][1]
                        sent_vec = np.reshape(sent_vec,
                                              [1, sent_vec.shape[0]])
                        for t in range(len(movie_clip_featmaps)):
                            featmap = movie_clip_featmaps[t][1]
                            visual_clip_name = movie_clip_featmaps[t][0]
                            start = float(visual_clip_name.split("_")[1])
                            end = float(
                                visual_clip_name.split("_")[2].split("_")[0])
                            featmap = np.reshape(featmap,
                                                 [1, featmap.shape[0]])
                            batch_out.append(
                                (featmap, sent_vec, start, end, k, t,
                                 movie_clip_sentences, movie_clip_featmaps,
                                 movie_name))
                            if len(batch_out) == self.batch_size:
                                yield batch_out
                                batch_out = []
            else:
                raise NotImplementedError('mode {} not implemented'.format(
                    self.mode))

        return reader
```
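The reader above keys everything off the sliding-clip file naming convention `<movie>_<start>_<end>.npy`. A small sketch of how a window is parsed and matched against a ground-truth clip, with made-up values:

```python
clip_name = "s13-d21.avi_128_256.npy"               # illustrative sliding-window file name
movie = clip_name.split("_")[0]                     # "s13-d21.avi"
start = int(clip_name.split("_")[1])                # 128
end = int(clip_name.split("_")[2].split(".")[0])    # 256

gt = (112, 240)                                     # ground-truth clip boundaries (made up)
inter = max(start, gt[0]), min(end, gt[1])          # (128, 240)
union = min(start, gt[0]), max(end, gt[1])          # (112, 256)
iou = (inter[1] - inter[0]) / float(union[1] - union[0])  # 112/144 ~ 0.78 > 0.5 -> candidate
offsets = (gt[0] - start, gt[1] - end)              # (-16, -16): the regression targets
```

A candidate is only kept when, additionally, the non-intersection-over-length ratio `nIoL` is below 0.15, so that the window is mostly covered by the ground-truth clip; here `nIoL = (128 - 112) / 128 = 0.125`, so the pair would be kept.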
PaddleCV/PaddleVideo/run.sh

```diff
@@ -75,7 +75,7 @@ elif [ "$mode"x == "eval"x ]; then
 elif [ "$mode"x == "predict"x ]; then
     echo $mode $name $configs $weights
     if [ "$weights"x != ""x ]; then
-        python -i predict.py --model_name=$name \
+        python predict.py --model_name=$name \
                     --config=$configs \
                     --log_interval=$log_interval \
                     --weights=$weights \
```
PaddleCV/PaddleVideo/train.py

```diff
@@ -21,12 +21,13 @@ import logging
 import numpy as np
 import paddle.fluid as fluid
 
-from utils.train_utils import train_with_pyreader
+from utils.train_utils import train_with_dataloader
 import models
 from utils.config_utils import *
 from reader import get_reader
 from metrics import get_metrics
 from utils.utility import check_cuda
+from utils.utility import check_version
 
 logging.root.handlers = []
 FORMAT = '[%(levelname)s: %(filename)s: %(lineno)4d]: %(message)s'
@@ -124,7 +125,7 @@ def train(args):
         train_prog.random_seed = 1000
     with fluid.program_guard(train_prog, startup):
         with fluid.unique_name.guard():
-            train_model.build_input(use_pyreader=True)
+            train_model.build_input(use_dataloader=True)
             train_model.build_model()
             # for the input, has the form [data1, data2,..., label], so train_feeds[-1] is label
             train_feeds = train_model.feeds()
@@ -134,16 +135,16 @@ def train(args):
                 item.persistable = True
             optimizer = train_model.optimizer()
             optimizer.minimize(train_loss)
-            train_pyreader = train_model.pyreader()
+            train_dataloader = train_model.dataloader()
 
     valid_prog = fluid.Program()
     with fluid.program_guard(valid_prog, startup):
         with fluid.unique_name.guard():
-            valid_model.build_input(use_pyreader=True)
+            valid_model.build_input(use_dataloader=True)
             valid_model.build_model()
             valid_feeds = valid_model.feeds()
             valid_fetch_list = valid_model.fetches()
-            valid_pyreader = valid_model.pyreader()
+            valid_dataloader = valid_model.dataloader()
             for item in valid_fetch_list:
                 item.persistable = True
@@ -172,12 +173,18 @@ def train(args):
     if args.model_name in ['CTCN']:
         build_strategy.enable_sequential_execution = True
 
+    exec_strategy = fluid.ExecutionStrategy()
+
     compiled_train_prog = fluid.compiler.CompiledProgram(
         train_prog).with_data_parallel(
-            loss_name=train_loss.name, build_strategy=build_strategy)
+            loss_name=train_loss.name,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
     compiled_valid_prog = fluid.compiler.CompiledProgram(
         valid_prog).with_data_parallel(
-            share_vars_from=compiled_train_prog, build_strategy=build_strategy)
+            share_vars_from=compiled_train_prog,
+            build_strategy=build_strategy,
+            exec_strategy=exec_strategy)
 
     # get reader
     bs_denominator = 1
@@ -190,8 +197,8 @@ def train(args):
         gpus = gpus.split(",")
         num_gpus = len(gpus)
         assert num_gpus == train_config.TRAIN.num_gpus, \
-            "num_gpus({}) set by CUDA_VISIBLE_DEVICES" \
-            "shoud be the same as that" \
+            "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
+            "shoud be the same as that " \
             "set in {}({})".format(num_gpus, args.config, train_config.TRAIN.num_gpus)
         bs_denominator = train_config.TRAIN.num_gpus
@@ -210,16 +217,14 @@ def train(args):
     epochs = args.epoch or train_model.epoch_num()
 
     exe_places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places()
-    train_pyreader.decorate_sample_list_generator(
-        train_reader, places=exe_places)
-    valid_pyreader.decorate_sample_list_generator(
-        valid_reader, places=exe_places)
+    train_dataloader.set_sample_list_generator(
+        train_reader, places=exe_places)
+    valid_dataloader.set_sample_list_generator(
+        valid_reader, places=exe_places)
 
-    train_with_pyreader(
+    train_with_dataloader(
         exe,
         train_prog,
         compiled_train_prog,  #train_exe,
-        train_pyreader,
+        train_dataloader,
         train_fetch_list,
         train_metrics,
         epochs=epochs,
@@ -229,7 +234,7 @@ def train(args):
         save_model_name=args.model_name,
         fix_random_seed=args.fix_random_seed,
         compiled_test_prog=compiled_valid_prog,  #test_exe=valid_exe,
-        test_pyreader=valid_pyreader,
+        test_dataloader=valid_dataloader,
         test_fetch_list=valid_fetch_list,
         test_metrics=valid_metrics)
@@ -238,6 +243,7 @@ if __name__ == "__main__":
     args = parse_args()
     # check whether the installed paddle is compiled with GPU
    check_cuda(args.use_gpu)
+    check_version()
     logger.info(args)
     if not os.path.exists(args.save_dir):
```
PaddleCV/PaddleVideo/utils/train_utils.py

```diff
@@ -47,19 +47,19 @@ def log_lr_and_step():
         logger.warn("Unable to get learning_rate and LR_DECAY_COUNTER.")
 
-def test_with_pyreader(exe,
-                       compiled_test_prog,
-                       test_pyreader,
-                       test_fetch_list,
-                       test_metrics,
-                       log_interval=0,
-                       save_model_name=''):
-    if not test_pyreader:
-        logger.error("[TEST] get pyreader failed.")
+def test_with_dataloader(exe,
+                         compiled_test_prog,
+                         test_dataloader,
+                         test_fetch_list,
+                         test_metrics,
+                         log_interval=0,
+                         save_model_name=''):
+    if not test_dataloader:
+        logger.error("[TEST] get dataloader failed.")
     test_metrics.reset()
     test_iter = 0
-    for data in test_pyreader():
+    for data in test_dataloader():
         test_outs = exe.run(compiled_test_prog,
                             fetch_list=test_fetch_list,
                             feed=data)
@@ -71,14 +71,14 @@ def test_with_pyreader(exe,
     test_metrics.finalize_and_log_out("[TEST] Finish")
 
-def train_with_pyreader(exe, train_prog, compiled_train_prog, train_pyreader, \
+def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, \
                         train_fetch_list, train_metrics, epochs=10, \
                         log_interval=0, valid_interval=0, save_dir='./', \
                         save_model_name='model', fix_random_seed=False, \
-                        compiled_test_prog=None, test_pyreader=None, \
+                        compiled_test_prog=None, test_dataloader=None, \
                         test_fetch_list=None, test_metrics=None):
-    if not train_pyreader:
-        logger.error("[TRAIN] get pyreader failed.")
+    if not train_dataloader:
+        logger.error("[TRAIN] get dataloader failed.")
     epoch_periods = []
     train_loss = 0
     for epoch in range(epochs):
@@ -87,7 +87,7 @@ def train_with_pyreader(exe, train_prog, compiled_train_prog, train_pyreader, \
         train_iter = 0
         epoch_periods = []
-        for data in train_pyreader():
+        for data in train_dataloader():
             cur_time = time.time()
             train_outs = exe.run(compiled_train_prog,
                                  fetch_list=train_fetch_list,
@@ -122,9 +122,9 @@ def train_with_pyreader(exe, train_prog, compiled_train_prog, train_pyreader, \
                 save_type='.pdparams')
         if compiled_test_prog and valid_interval > 0 and (
                 epoch + 1) % valid_interval == 0:
-            test_with_pyreader(exe, compiled_test_prog, test_pyreader,
-                               test_fetch_list, test_metrics, log_interval,
-                               save_model_name)
+            test_with_dataloader(exe, compiled_test_prog, test_dataloader,
+                                 test_fetch_list, test_metrics, log_interval,
+                                 save_model_name)
     save_model(exe,
```
PaddleCV/PaddleVideo/utils/utility.py

```diff
@@ -13,12 +13,16 @@
 #limitations under the License.
 
 import os
 import sys
 import signal
+import logging
+import paddle
+import paddle.fluid as fluid
 
 __all__ = ['AttrDict']
 
+logger = logging.getLogger(__name__)
+
 def _term(sig_num, addition):
     print('current pid is %s, group id is %s' % (os.getpid(), os.getpgrp()))
@@ -49,3 +53,19 @@ def check_cuda(use_cuda, err = \
             sys.exit(1)
     except Exception as e:
         pass
+
+def check_version():
+    """
+    Log error and exit when the installed version of paddlepaddle is
+    not satisfied.
+    """
+    err = "PaddlePaddle version 1.6 or higher is required, " \
+          "or a suitable develop version is satisfied as well. \n" \
+          "Please make sure the version is good with your code." \
+
+    try:
+        fluid.require_version('1.6.0')
+    except Exception as e:
+        logger.error(err)
+        sys.exit(1)
```