From b127e4b340d294b4d160917d1e0ae5e355372a15 Mon Sep 17 00:00:00 2001
From: Jintao Lin
Date: Thu, 20 Aug 2020 08:15:03 +0800
Subject: [PATCH] Update 256p performance (#132)

---
 configs/recognition/i3d/README.md | 5 +-
 configs/recognition/r2plus1d/README.md | 1 +
 configs/recognition/slowfast/README.md | 2 +
 configs/recognition/slowonly/README.md | 5 +-
 ...edcrop_256p_4x16x1_256e_kinetics400_rgb.py | 115 +++++++++++++++++
 configs/recognition/tsm/README.md | 3 +
 configs/recognition/tsn/README.md | 12 +-
 ...alecrop_256p_1x1x3_100e_kinetics400_rgb.py | 116 ++++++++++++++++++
 ...zedcrop_256p_1x1x3_100e_kinetics400_rgb.py | 115 +++++++++++++++++
 ...256p_1x1x25_10crop_100e_kinetics400_rgb.py | 53 ++++++++
 ..._256p_1x1x25_3crop_100e_kinetics400_rgb.py | 53 ++++++++
 11 files changed, 477 insertions(+), 3 deletions(-)
 create mode 100644 configs/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_256p_4x16x1_256e_kinetics400_rgb.py
 create mode 100644 configs/recognition/tsn/data_benchmark/tsn_r50_multiscalecrop_256p_1x1x3_100e_kinetics400_rgb.py
 create mode 100644 configs/recognition/tsn/data_benchmark/tsn_r50_randomresizedcrop_256p_1x1x3_100e_kinetics400_rgb.py
 create mode 100644 configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_10crop_100e_kinetics400_rgb.py
 create mode 100644 configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_3crop_100e_kinetics400_rgb.py

diff --git a/configs/recognition/i3d/README.md b/configs/recognition/i3d/README.md
index f40adb1..a938739 100644
--- a/configs/recognition/i3d/README.md
+++ b/configs/recognition/i3d/README.md
@@ -7,8 +7,11 @@
 |config | resolution | gpus | backbone |pretrain| top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
 |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
 |[i3d_r50_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py) |340x256|8| ResNet50|ImageNet |72.68|90.78|1.7 (320x3 frames)| 5170|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/i3d_r50_32x2x1_100e_kinetics400_rgb_20200614-c25ef9a4.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/20200614_060456.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/20200614_060456.log.json)|
+|[i3d_r50_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50|ImageNet |73.27|90.92|x|5170|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_256p_32x2x1_100e_kinetics400_rgb/i3d_r50_256p_32x2x1_100e_kinetics400_rgb_20200801-7d9f44de.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_256p_32x2x1_100e_kinetics400_rgb/20200725_031555.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_256p_32x2x1_100e_kinetics400_rgb/20200725_031555.log.json)|
 |[i3d_r50_dense_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb.py) |340x256|8x2| ResNet50| ImageNet|72.77|90.57|1.7 (320x3 frames)| 5170| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/i3d_r50_dense_32x2x1_100e_kinetics400_rgb_20200616-2bbb4361.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/20200616_230011.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/20200616_230011.log.json)|
+|[i3d_r50_dense_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet|73.48|91.00|x| 5170| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb_20200725-24eb54cc.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb/20200725_031604.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_256p_32x2x1_100e_kinetics400_rgb/20200725_031604.log.json)|
 |[i3d_r50_fast_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb.py) |340x256|8| ResNet50 |ImageNet|72.32|90.72|1.8 (320x3 frames)| 5170| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/i3d_r50_fast_32x2x1_100e_kinetics400_rgb_20200612-000e4d2a.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/20200612_233836.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/20200612_233836.log.json)|
+|[i3d_r50_fast_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet|73.24|90.99|x| 5170| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb_20200817-4e90d1d5.pth)| [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb/20200725_031457.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_256p_32x2x1_100e_kinetics400_rgb/20200725_031457.log.json) |
 
 Notes:
 1. The **gpus** indicates the number of gpus we used to get the checkpoint. Note that the configs we provide are used for 8 gpus by default.
@@ -44,7 +47,7 @@ Example: test I3D model on Kinetics-400 dataset and dump the result to a json fi
 ```shell
 python tools/test.py configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py \
     checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
-    --out result.json
+    --out result.json --average-clips prob
 ```
 For more details, you can refer to **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
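For context, the `--average-clips prob` flag added above fuses per-clip scores in probability space rather than as raw logits before picking the video-level label. A minimal NumPy sketch of the idea — the shapes and values are illustrative, not MMAction2's actual implementation:

```python
import numpy as np

def softmax(x, axis=-1):
    # Numerically stable softmax over the class axis.
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

# Illustrative scores for one video: 30 test views
# (e.g. 10 clips x 3 crops), 400 Kinetics classes.
clip_logits = np.random.randn(30, 400)

# average_clips='prob': map each view to probabilities, then average.
video_prob = softmax(clip_logits).mean(axis=0)
prediction = int(video_prob.argmax())
```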
diff --git a/configs/recognition/r2plus1d/README.md b/configs/recognition/r2plus1d/README.md
index 376ce7f..6c936f5 100644
--- a/configs/recognition/r2plus1d/README.md
+++ b/configs/recognition/r2plus1d/README.md
@@ -6,6 +6,7 @@
 |config | resolution | gpus | backbone | pretrain| top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log| json|
 |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
+|[r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 256|8x4| ResNet34|None |67.30|87.65|x|5019|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb_20200729-aa94765e.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_256p_8x8x1_180e_kinetics400_rgb/20200728_021421.log.json)|
 |[r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) | short-side 320|8x2| ResNet34|None |68.68|88.36|1.6 (80x3 frames)|5019|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth)| [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json)|
 |[r2plus1d_r34_32x2x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py) |short-side 320|8x2| ResNet34|None |74.60|91.59|0.5 (320x3 frames)|12975| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json)|
diff --git a/configs/recognition/slowfast/README.md b/configs/recognition/slowfast/README.md
index 234bb1e..f4fadc1 100644
--- a/configs/recognition/slowfast/README.md
+++ b/configs/recognition/slowfast/README.md
@@ -6,7 +6,9 @@
 |config | resolution | gpus | backbone |pretrain| top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log| json|
 |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
+|[slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) |short-side 256|8x4| ResNet50|None |74.75|91.73|x|6203|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb_20200728-145f1097.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200728_022505.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_256p_4x16x1_256e_kinetics400_rgb/20200728_022505.log.json)|
 |[slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) |short-side 320|8x3| ResNet50|None |75.64|92.3|1.6 ((32+4)x10x3 frames)|6203|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20200704-bcde7ed7.pth)| [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/20200704_232901.log.json)|
+|[slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) |short-side 256|8x4| ResNet50 |None |75.61|92.34|x|9062|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb_20200810-863812c2.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb/20200731_151537.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_256p_8x8x1_256e_kinetics400_rgb/20200731_151537.log.json)|
 |[slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) |short-side 320|8x3| ResNet50 |None|76.94|92.8|1.3 ((32+8)x10x3 frames)|9062| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200716-73547d2b.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/20200716_192653.log.json)|
 
 Notes:
diff --git a/configs/recognition/slowonly/README.md b/configs/recognition/slowonly/README.md
index 572b769..5bbf6ad 100644
--- a/configs/recognition/slowonly/README.md
+++ b/configs/recognition/slowonly/README.md
@@ -6,18 +6,21 @@
 |config | resolution | gpus | backbone |pretrain| top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M) | ckpt | log| json|
 |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
+|[slowonly_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb.py)|short-side 256|8x4| ResNet50 | None |72.76|90.51|x|3168|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb_20200820-bea7701f.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb/20200817_001411.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb/20200817_001411.log.json)|
+|[slowonly_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb.py) |short-side 256|8x4| ResNet50 | None |74.42|91.49|x|5820|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_8x8x1_256e_kinetics400_rgb/slowonly_r50_256p_8x8x1_256e_kinetics400_rgb_20200820-75851a7d.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_8x8x1_256e_kinetics400_rgb/20200817_003320.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_8x8x1_256e_kinetics400_rgb/20200817_003320.log.json)|
 |[slowonly_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb.py)|short-side 320|8x2| ResNet50 | None |73.02|90.77|4.0 (40x3 frames)|3168|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth)| [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/so_4x16.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/slowonly_r50_4x16_73.02_90.77.log.json)|
 |[slowonly_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb.py) |short-side 320|8x3| ResNet50 | None |74.93|91.92|2.3 (80x3 frames)|5820| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb/slowonly_r50_8x8x1_256e_kinetics400_rgb_20200703-a79c555a.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb/so_8x8.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb/slowonly_r50_8x8_74.93_91.92.log.json)|
 |[slowonly_r50_4x16x1_256e_kinetics400_flow](/configs/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow.py)|short-side 320|8x2| ResNet50 | ImageNet |61.79|83.62|x|8450| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow/slowonly_r50_4x16x1_256e_kinetics400_flow_20200704-decb8568.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow/slowonly_r50_4x16x1_256e_kinetics400_flow_61.8_83.6.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow/slowonly_r50_4x16x1_256e_kinetics400_flow_61.8_83.6.log.json)|
 |[slowonly_r50_8x8x1_196e_kinetics400_flow](/configs/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow.py) |short-side 320|8x4| ResNet50 | ImageNet |65.76|86.25|x|8455| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow/slowonly_r50_8x8x1_256e_kinetics400_flow_20200704-6b384243.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow/slowonly_r50_8x8x1_196e_kinetics400_flow_65.8_86.3.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow/slowonly_r50_8x8x1_196e_kinetics400_flow_65.8_86.3.log.json)|
 
 ### Kinetics-400 Data Benchmark
 
-In data benchmark, we compare two different data preprocessing methods: (1) Resize video to 340x256, (2) Resize the short edge of video to 320px.
+In data benchmark, we compare three different data preprocessing methods: (1) Resize video to 340x256, (2) Resize the short edge of video to 320px, (3) Resize the short edge of video to 256px.
 
 | config | resolution | gpus | backbone | Input | pretrain | top1 acc | top5 acc | testing protocol | ckpt | log | json |
 | :----------------------------------------------------------- | :------------: | :--: | :------: | :---: | :------: | :------: | :------: | :----------------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
 | [slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb](data_benchmark/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb.py) | 340x256 | 8x2 | ResNet50 | 4x16 | None | 71.61 | 90.05 | 10 clips x 3 crops | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb_20200803-dadca1a3.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb_20200803.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb/slowonly_r50_randomresizedcrop_340x256_4x16x1_256e_kinetics400_rgb_20200803.json) |
 | [slowonly_r50_randomresizedcrop_320p_4x16x1_256e_kinetics400_rgb](data_benchmark/slowonly_r50_randomresizedcrop_320p_4x16x1_256e_kinetics400_rgb.py) | short-side 320 | 8x2 | ResNet50 | 4x16 | None | 73.02 | 90.77 | 10 clips x 3 crops | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/slowonly_r50_4x16x1_256e_kinetics400_rgb_20200704-a69556c6.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/so_4x16.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_rgb/slowonly_r50_4x16_73.02_90.77.log.json) |
+| [slowonly_r50_randomresizedcrop_256p_4x16x1_256e_kinetics400_rgb](data_benchmark/slowonly_r50_randomresizedcrop_256p_4x16x1_256e_kinetics400_rgb.py) | short-side 256 | 8x4 | ResNet50 | 4x16 | None | 72.76 | 90.51 | 10 clips x 3 crops | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb_20200820-bea7701f.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb/20200817_001411.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_256p_4x16x1_256e_kinetics400_rgb/20200817_001411.log.json) |
 
 Notes:
diff --git a/configs/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_256p_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_256p_4x16x1_256e_kinetics400_rgb.py
new file mode 100644
index 0000000..ba5c74f
--- /dev/null
+++ b/configs/recognition/slowonly/data_benchmark/slowonly_r50_randomresizedcrop_256p_4x16x1_256e_kinetics400_rgb.py
@@ -0,0 +1,115 @@
+model = dict(
+    type='Recognizer3D',
+    backbone=dict(
+        type='ResNet3dSlowOnly',
+        depth=50,
+        pretrained=None,
+        lateral=False,
+        conv1_kernel=(1, 7, 7),
+        conv1_stride_t=1,
+        pool1_stride_t=1,
+        inflate=(0, 0, 1, 1),
+        norm_eval=False),
+    cls_head=dict(
+        type='I3DHead',
+        in_channels=2048,
+        num_classes=400,
+        spatial_type='avg',
+        dropout_ratio=0.5))
+train_cfg = None
+test_cfg = dict(average_clips=None)
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train_256p'
+data_root_val = 'data/kinetics400/rawframes_val_256p'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes_256p.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=4, frame_interval=16, num_clips=1),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=4,
+        frame_interval=16,
+        num_clips=1,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=4,
+        frame_interval=16,
+        num_clips=10,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCTHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=16,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.6, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='CosineAnnealing', min_lr=0)
+total_epochs = 256
+checkpoint_config = dict(interval=4)
+workflow = [('train', 1)]
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = ('./work_dirs/slowonly_r50_randomresizedcrop_256p_4x16x1'
+            '_256e_kinetics400_rgb')
+load_from = None
+resume_from = None
+find_unused_parameters = False
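For reference, the `4x16x1` in the file name above mirrors `SampleFrames(clip_len=4, frame_interval=16, num_clips=1)` in the train pipeline: one clip of 4 frames sampled 16 frames apart. A simplified sketch of the index arithmetic; the real `SampleFrames` op additionally handles short videos, frame looping, and deterministic test-time offsets:

```python
import numpy as np

def sample_frames(total_frames, clip_len=4, frame_interval=16, num_clips=1,
                  seed=None):
    """Simplified train-time sampling: a random start per clip, then
    clip_len frames spaced frame_interval apart."""
    rng = np.random.default_rng(seed)
    span = clip_len * frame_interval
    starts = rng.integers(0, max(total_frames - span + 1, 1), size=num_clips)
    offsets = np.arange(clip_len) * frame_interval
    # Clamp so indices stay valid even when the video is shorter than span.
    return np.minimum(starts[:, None] + offsets[None, :], total_frames - 1)

print(sample_frames(300))  # one clip of 4 frame indices, stride 16
```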
diff --git a/configs/recognition/tsm/README.md b/configs/recognition/tsm/README.md
index 20ee98f..7a8b01b 100644
--- a/configs/recognition/tsm/README.md
+++ b/configs/recognition/tsm/README.md
@@ -7,9 +7,12 @@
 |config | resolution | gpus | backbone | pretrain | top1 acc| top5 acc | reference top1 acc | reference top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
 |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
 |[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |340x256|8| ResNet50| ImageNet |70.24|89.56|[70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|[89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|74.0 (8x1 frames)| 7079 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json)|
+|[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.59|89.52|x|x|x|7079|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/tsm_r50_256p_1x1x8_50e_kinetics400_rgb_20200726-020785e2.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x8_50e_kinetics400_rgb/20200725_031623.log.json)|
 |[tsm_r50_video_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |70.25|89.66|[70.36](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|[89.49](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_8f.sh)|74.0 (8x1 frames)| 7077 | [ckpt]( https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_1x1x8_100e_kinetics400_rgb_20200702-a77f4328.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb/tsm_r50_video_2d_1x1x8_50e_kinetics400_rgb.log.json)|
 |[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |340x256|8x4| ResNet50 | ImageNet|72.9|90.44|[72.22](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#dense-sample)|[90.37](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#dense-sample)|11.5 (8x10 frames)| 7079 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20200626-91a54551.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log.json)|
+|[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50 | ImageNet|73.38|91.02|x|x|x|7079|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb_20200727-e1e0c785.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/20200725_032043.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_256p_1x1x8_100e_kinetics400_rgb/20200725_032043.log.json)|
 |[tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) |340x256|8| ResNet50| ImageNet |71.69|90.4|[70.67](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh)|[89.98](https://github.com/mit-han-lab/temporal-shift-module/blob/8d53d6fda40bea2f1b37a6095279c4b454d672bd/scripts/train_tsm_kinetics_rgb_16f.sh)|47.0 (16x1 frames)| 10404 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_1x1x16_50e_kinetics400_rgb_20200607-f731bffc.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20200607_221310.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20200607_221310.log.json)|
+|[tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) |short-side 256|8| ResNet50| ImageNet |72.01|90.57|x|x|x|10398|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/tsm_r50_256p_1x1x16_50e_kinetics400_rgb_20200727-b414aa3c.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20200725_031232.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_256p_1x1x16_50e_kinetics400_rgb/20200725_031232.log.json)|
 
 ### Something-Something V1
diff --git a/configs/recognition/tsn/README.md b/configs/recognition/tsn/README.md
index e192d37..ce371b1 100644
--- a/configs/recognition/tsn/README.md
+++ b/configs/recognition/tsn/README.md
@@ -13,10 +13,12 @@
 |config | resolution | gpus | backbone|pretrain | top1 acc| top5 acc | reference top1 acc | reference top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
 |:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
 |[tsn_r50_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) |340x256|8| ResNet50 | ImageNet|70.60|89.26|x|x|4.3 (25x10 frames)|8344| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/20200614_063526.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/20200614_063526.log.json)|
+|[tsn_r50_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) |short-side 256|8| ResNet50 | ImageNet|70.42|89.03|x|x|x|8343|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/tsn_r50_256p_1x1x3_100e_kinetics400_rgb_20200725-22592236.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/20200725_031325.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/20200725_031325.log.json)|
 |[tsn_r50_dense_1x1x5_50e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb.py) |340x256|8x3| ResNet50| ImageNet |70.18|89.10|[69.15](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|[88.56](https://github.com/mit-han-lab/temporal-shift-module/tree/8d53d6fda40bea2f1b37a6095279c4b454d672bd#training)|12.7 (8x10 frames)|7028| [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/tsn_r50_dense_1x1x5_100e_kinetics400_rgb_20200627-a063165f.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/20200627_105310.log)| [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/20200627_105310.log.json)|
 |[tsn_r50_320p_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb.py) |short-side 320|8x2| ResNet50| ImageNet |70.91|89.51|x|x|10.7 (25x3 frames)| 8344 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_320p_1x1x3_100e_kinetics400_rgb_20200702-cc665e2a.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log.json)|
 |[tsn_r50_320p_1x1x3_110e_kinetics400_flow](/configs/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow.py) |short-side 320|8x2| ResNet50 | ImageNet|55.70|79.85|x|x|x| 8471 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_320p_1x1x3_110e_kinetics400_flow_20200705-3036bab6.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_f3_kinetics400_flow_shortedge_55.7_79.9.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_f3_kinetics400_flow_shortedge_55.7_79.9.log.json)|
 |tsn_r50_320p_1x1x3_kinetics400_twostream [1: 1]* |x|x| ResNet50 | ImageNet|72.76|90.52| x | x | x | x | x|x|x|
+|[tsn_r50_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb.py)|short-side 256|8| ResNet50| ImageNet |71.80|90.17|x|x|x|8343|[ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/tsn_r50_256p_1x1x8_100e_kinetics400_rgb_20200817-883baf16.pth)|[log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/20200815_173413.log)|[json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x8_100e_kinetics400_rgb/20200815_173413.log.json)|
 |[tsn_r50_320p_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb.py) |short-side 320|8x3| ResNet50| ImageNet |72.41|90.55|x|x|11.1 (25x3 frames)| 8344 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_320p_1x1x8_100e_kinetics400_rgb_20200702-ef80e3d7.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_f8_kinetics400_shortedge_72.4_90.6.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_f8_kinetics400_shortedge_72.4_90.6.log.json)|
 |[tsn_r50_320p_1x1x8_110e_kinetics400_flow](/configs/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow.py) |short-side 320|8x4| ResNet50 | ImageNet|57.76|80.99|x|x|x| 8473 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_320p_1x1x8_110e_kinetics400_flow_20200705-1f39486b.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_f8_kinetics400_flow_shortedge_57.8_81.0.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_f8_kinetics400_flow_shortedge_57.8_81.0.log.json)|
 |tsn_r50_320p_1x1x8_kinetics400_twostream [1: 1]* |x|x| ResNet50| ImageNet |74.64|91.77| x | x | x | x | x|x|x|
@@ -28,7 +30,10 @@ Here, We use [1: 1] to indicate that we combine rgb and flow score with coeffici
 ### Kinetics-400 Data Benchmark (8-gpus, ResNet50, ImageNet pretrain; 3 segments)
 
-In data benchmark, we compare: 1. Different data preprocessing methods: (1) Resize video to 340x256, (2) Resize the short edge of video to 320px; 2. Different data augmentation methods: (1) MultiScaleCrop, (2) RandomResizedCrop; 3. Different testing protocols: (1) 25 frames x 10 crops, (2) 25 frames x 3 crops.
+In data benchmark, we compare:
+1. Different data preprocessing methods: (1) Resize video to 340x256, (2) Resize the short edge of video to 320px, (3) Resize the short edge of video to 256px;
+2. Different data augmentation methods: (1) MultiScaleCrop, (2) RandomResizedCrop;
+3. Different testing protocols: (1) 25 frames x 10 crops, (2) 25 frames x 3 crops.
 
 | config | resolution | training augmentation | testing protocol | top1 acc | top5 acc | ckpt | log | json |
 | :----------------------------------------------------------: | :------------: | :-------------------: | :--------------: | :------: | :------: | :----------------------------------------------------------: | :----------------------------------------------------------: | :----------------------------------------------------------: |
@@ -40,6 +45,11 @@ In data benchmark, we compare: 1. Different data preprocessing methods: (1) Resi
 | x | short-side 320 | MultiScaleCrop | 25x3 frames | 70.54 | 89.39 | x | x | x |
 | [tsn_r50_randomresizedcrop_320p_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/data_benchmark/tsn_r50_randomresizedcrop_320p_1x1x3_100e_kinetics400_rgb.py) | short-side 320 | RandomResizedCrop | 25x10 frames | 70.44 | 89.23 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_320p_1x1x3_100e_kinetics400_rgb_20200702-cc665e2a.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log.json) |
 | x | short-side 320 | RandomResizedCrop | 25x3 frames | 70.91 | 89.51 | x | x | x |
+| [tsn_r50_multiscalecrop_256p_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/data_benchmark/tsn_r50_multiscalecrop_256p_1x1x3_100e_kinetics400_rgb.py) | short-side 256 | MultiScaleCrop | 25x10 frames | 70.42 | 89.03 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/tsn_r50_256p_1x1x3_100e_kinetics400_rgb_20200725-22592236.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/20200725_031325.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_1x1x3_100e_kinetics400_rgb/20200725_031325.log.json)|
+| x | short-side 256 | MultiScaleCrop | 25x3 frames | 70.79 | 89.42 | x | x | x |
+| [tsn_r50_randomresizedcrop_256p_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/data_benchmark/tsn_r50_randomresizedcrop_256p_1x1x3_100e_kinetics400_rgb.py) | short-side 256 | RandomResizedCrop | 25x10 frames | 69.80 | 89.06 | [ckpt](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_randomresize_1x1x3_100e_kinetics400_rgb/tsn_r50_256p_randomresize_1x1x3_100e_kinetics400_rgb_20200817-ae7963ca.pth) | [log](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_randomresize_1x1x3_100e_kinetics400_rgb/20200815_172601.log) | [json](https://openmmlab.oss-accelerate.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_256p_randomresize_1x1x3_100e_kinetics400_rgb/20200815_172601.log.json)|
+| x | short-side 256 | RandomResizedCrop | 25x3 frames | 70.48 | 89.89 | x | x | x |
+
 
 ### Something-Something V1
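The augmentation comparison above comes down to how each op picks its crop geometry. A rough sketch of both samplers, using the parameters from the configs in this patch (`scales=(1, 0.875, 0.75, 0.66)`, `max_wh_scale_gap=1`); the area and aspect-ratio defaults for `RandomResizedCrop` are the usual Inception-style values and are an assumption here, and MMAction2's actual corner-sampling logic is more involved:

```python
import random

def multiscale_crop_size(short_edge=256, scales=(1, 0.875, 0.75, 0.66),
                         max_wh_scale_gap=1):
    # Crop width/height are short_edge * scale; the two chosen scale
    # indices may differ by at most max_wh_scale_gap steps.
    i = random.randrange(len(scales))
    j = random.randrange(max(0, i - max_wh_scale_gap),
                         min(len(scales), i + max_wh_scale_gap + 1))
    return int(short_edge * scales[i]), int(short_edge * scales[j])

def random_resized_crop_size(w=456, h=256, area_range=(0.08, 1.0),
                             ratio_range=(3 / 4, 4 / 3)):
    # Sample a target area and aspect ratio (Inception-style cropping);
    # real implementations retry a few times before falling back.
    area = random.uniform(*area_range) * w * h
    ratio = random.uniform(*ratio_range)
    crop_w = int(round((area * ratio) ** 0.5))
    crop_h = int(round((area / ratio) ** 0.5))
    return min(crop_w, w), min(crop_h, h)
```

Either way, the crop is then resized to 224x224 before entering the network, so the two methods differ only in the distribution of crop regions seen during training.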
diff --git a/configs/recognition/tsn/data_benchmark/tsn_r50_multiscalecrop_256p_1x1x3_100e_kinetics400_rgb.py b/configs/recognition/tsn/data_benchmark/tsn_r50_multiscalecrop_256p_1x1x3_100e_kinetics400_rgb.py
new file mode 100644
index 0000000..ad7e610
--- /dev/null
+++ b/configs/recognition/tsn/data_benchmark/tsn_r50_multiscalecrop_256p_1x1x3_100e_kinetics400_rgb.py
@@ -0,0 +1,116 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train_256p'
+data_root_val = 'data/kinetics400/rawframes_val_256p'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes_256p.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(
+        type='MultiScaleCrop',
+        input_size=224,
+        scales=(1, 0.875, 0.75, 0.66),
+        random_crop=False,
+        max_wh_scale_gap=1),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=3,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='TenCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=32,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
+total_epochs = 100
+checkpoint_config = dict(interval=5)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20, hooks=[
+        dict(type='TextLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = ('./work_dirs/tsn_r50_multiscalecrop_256p_1x1x3'
+            '_100e_kinetics400_rgb/')
+load_from = None
+resume_from = None
+workflow = [('train', 5)]
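A note on the schedule in the config above: `lr_config = dict(policy='step', step=[40, 80])` drops the learning rate at epochs 40 and 80 of the 100-epoch run. The decay factor is not spelled out in the config; the sketch below assumes MMCV's common default of `gamma=0.1`:

```python
def step_lr(epoch, base_lr=0.01, milestones=(40, 80), gamma=0.1):
    """Multiply the base LR by gamma once per milestone already passed."""
    passed = sum(epoch >= m for m in milestones)
    return base_lr * gamma ** passed

assert step_lr(10) == 0.01                 # before the first milestone
assert abs(step_lr(50) - 1e-3) < 1e-12     # after one drop
assert abs(step_lr(90) - 1e-4) < 1e-12     # after two drops
```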
diff --git a/configs/recognition/tsn/data_benchmark/tsn_r50_randomresizedcrop_256p_1x1x3_100e_kinetics400_rgb.py b/configs/recognition/tsn/data_benchmark/tsn_r50_randomresizedcrop_256p_1x1x3_100e_kinetics400_rgb.py
new file mode 100644
index 0000000..f5a5ea5
--- /dev/null
+++ b/configs/recognition/tsn/data_benchmark/tsn_r50_randomresizedcrop_256p_1x1x3_100e_kinetics400_rgb.py
@@ -0,0 +1,115 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root = 'data/kinetics400/rawframes_train_256p'
+data_root_val = 'data/kinetics400/rawframes_val_256p'
+ann_file_train = 'data/kinetics400/kinetics400_train_list_rawframes_256p.txt'
+ann_file_val = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+train_pipeline = [
+    dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='RandomResizedCrop'),
+    dict(type='Resize', scale=(224, 224), keep_ratio=False),
+    dict(type='Flip', flip_ratio=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs', 'label'])
+]
+val_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=3,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='CenterCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    videos_per_gpu=32,
+    workers_per_gpu=4,
+    train=dict(
+        type=dataset_type,
+        ann_file=ann_file_train,
+        data_prefix=data_root,
+        pipeline=train_pipeline),
+    val=dict(
+        type=dataset_type,
+        ann_file=ann_file_val,
+        data_prefix=data_root_val,
+        pipeline=val_pipeline),
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+# optimizer
+optimizer = dict(
+    type='SGD', lr=0.01, momentum=0.9,
+    weight_decay=0.0001)  # this lr is used for 8 gpus
+optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
+# learning policy
+lr_config = dict(policy='step', step=[40, 80])
+total_epochs = 100
+checkpoint_config = dict(interval=5)
+evaluation = dict(
+    interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
+log_config = dict(
+    interval=20,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        # dict(type='TensorboardLoggerHook'),
+    ])
+# runtime settings
+dist_params = dict(backend='nccl')
+log_level = 'INFO'
+work_dir = ('./work_dirs/tsn_r50_randomresizedcrop_256p_1x1x3'
+            '_100e_kinetics400_rgb/')
+load_from = None
+resume_from = None
+workflow = [('train', 1)]
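The comment `# this lr is used for 8 gpus` in the optimizer block above points at the usual linear scaling rule: when the number of GPUs or `videos_per_gpu` changes, the learning rate is scaled with the total batch size. A hypothetical helper — the baseline of 8 GPUs x 32 videos matches this config, and the rule itself is a heuristic, not something the config enforces:

```python
def scaled_lr(base_lr=0.01, base_batch=8 * 32, gpus=16, videos_per_gpu=32):
    """Linear scaling rule: lr is proportional to the total batch size."""
    return base_lr * (gpus * videos_per_gpu) / base_batch

print(scaled_lr())  # 0.02: doubling the GPU count doubles the lr
```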
diff --git a/configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_10crop_100e_kinetics400_rgb.py b/configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_10crop_100e_kinetics400_rgb.py
new file mode 100644
index 0000000..70da19b
--- /dev/null
+++ b/configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_10crop_100e_kinetics400_rgb.py
@@ -0,0 +1,53 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root_val = 'data/kinetics400/rawframes_val_256p'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+mc_cfg = dict(
+    server_list_cfg='/mnt/lustre/share/memcached_client/server_list.conf',
+    client_cfg='/mnt/lustre/share/memcached_client/client.conf',
+    sys_path='/mnt/lustre/share/pymc/py3')
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='TenCrop', crop_size=224),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    workers_per_gpu=4,
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+dist_params = dict(backend='nccl')
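This test-only config and the 3-crop variant in the next file differ only in the spatial cropping op. For reference, a compact sketch of what the two crop ops amount to: classic ten-cropping takes the four corners plus the center and mirrors each, while three-cropping tiles square crops along the longer edge. This is an illustrative NumPy version, not MMAction2's implementation:

```python
import numpy as np

def ten_crop(img, size=224):
    """4 corners + center, plus horizontal flips -> 10 views (HxWxC input)."""
    h, w = img.shape[:2]
    xs = [0, w - size, 0, w - size, (w - size) // 2]
    ys = [0, 0, h - size, h - size, (h - size) // 2]
    crops = [img[y:y + size, x:x + size] for x, y in zip(xs, ys)]
    return crops + [c[:, ::-1] for c in crops]

def three_crop(img, size=256):
    """3 square crops tiled along the longer edge (left/middle/right)."""
    h, w = img.shape[:2]
    if w >= h:
        xs = [0, (w - size) // 2, w - size]
        return [img[:, x:x + size] for x in xs]
    ys = [0, (h - size) // 2, h - size]
    return [img[y:y + size] for y in ys]

frame = np.zeros((256, 454, 3), dtype=np.uint8)  # short edge resized to 256
assert len(ten_crop(frame)) == 10 and ten_crop(frame)[0].shape == (224, 224, 3)
assert len(three_crop(frame)) == 3 and three_crop(frame)[0].shape == (256, 256, 3)
# With num_clips=25, that is 25 x 10 = 250 or 25 x 3 = 75 views per video.
```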
diff --git a/configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_3crop_100e_kinetics400_rgb.py b/configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_3crop_100e_kinetics400_rgb.py
new file mode 100644
index 0000000..328d0a8
--- /dev/null
+++ b/configs/recognition/tsn/data_benchmark/tsn_r50_test_256p_1x1x25_3crop_100e_kinetics400_rgb.py
@@ -0,0 +1,53 @@
+# model settings
+model = dict(
+    type='Recognizer2D',
+    backbone=dict(
+        type='ResNet',
+        pretrained='torchvision://resnet50',
+        depth=50,
+        norm_eval=False),
+    cls_head=dict(
+        type='TSNHead',
+        num_classes=400,
+        in_channels=2048,
+        spatial_type='avg',
+        consensus=dict(type='AvgConsensus', dim=1),
+        dropout_ratio=0.4,
+        init_std=0.01))
+# model training and testing settings
+train_cfg = None
+test_cfg = dict(average_clips=None)
+# dataset settings
+dataset_type = 'RawframeDataset'
+data_root_val = 'data/kinetics400/rawframes_val_256p'
+ann_file_test = 'data/kinetics400/kinetics400_val_list_rawframes_256p.txt'
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
+mc_cfg = dict(
+    server_list_cfg='/mnt/lustre/share/memcached_client/server_list.conf',
+    client_cfg='/mnt/lustre/share/memcached_client/client.conf',
+    sys_path='/mnt/lustre/share/pymc/py3')
+test_pipeline = [
+    dict(
+        type='SampleFrames',
+        clip_len=1,
+        frame_interval=1,
+        num_clips=25,
+        test_mode=True),
+    dict(type='RawFrameDecode'),
+    dict(type='Resize', scale=(-1, 256)),
+    dict(type='ThreeCrop', crop_size=256),
+    dict(type='Flip', flip_ratio=0),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='FormatShape', input_format='NCHW'),
+    dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
+    dict(type='ToTensor', keys=['imgs'])
+]
+data = dict(
+    workers_per_gpu=4,
+    test=dict(
+        type=dataset_type,
+        ann_file=ann_file_test,
+        data_prefix=data_root_val,
+        pipeline=test_pipeline))
+dist_params = dict(backend='nccl')
--
GitLab