Commit b3e8fdf3 authored by linjintao

Merge branch 'ljt/add_explain' into 'master'

Add explanation and remove redundant config

See merge request open-mmlab/mmaction-lite!356
@@ -6,11 +6,9 @@
|config | gpus | backbone |pretrain| top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[i3d_r34_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r34_32x2x1_100e_kinetics400_rgb.py) |8| ResNet34|ImageNet |68.37|88.15|1.6 (320x3 frames)| 3176| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r34_32x2x1_100e_kinetics400_rgb/i3d_r34_32x2x1_100e_kinetics400_rgb_20200612-c883432b.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r34_32x2x1_100e_kinetics400_rgb/20200612_083439.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r34_32x2x1_100e_kinetics400_rgb/20200612_083439.log.json)|
|[i3d_r50_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb.py) |8| ResNet50|ImageNet |72.68|90.78|1.7 (320x3 frames)| 5170|[ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/i3d_r50_32x2x1_100e_kinetics400_rgb_20200614-c25ef9a4.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/20200614_060456.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_32x2x1_100e_kinetics400_rgb/20200614_060456.log.json)|
|[i3d_r50_dense_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb.py) |8x2| ResNet50| ImageNet|72.77|90.57|1.7 (320x3 frames)| 5170| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/i3d_r50_dense_32x2x1_100e_kinetics400_rgb_20200616-2bbb4361.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/20200616_230011.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_dense_32x2x1_100e_kinetics400_rgb/20200616_230011.log.json)|
|[i3d_r50_fast_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb.py) |8| ResNet50 |ImageNet|72.32|90.72|1.8 (320x3 frames)| 5170| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/i3d_r50_fast_32x2x1_100e_kinetics400_rgb_20200612-000e4d2a.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/20200612_233836.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/i3d/i3d_r50_fast_32x2x1_100e_kinetics400_rgb/20200612_233836.log.json)|
|[i3d_r50_video_3d_32x2x1_100e_kinetics400_rgb](/configs/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb.py) |8| ResNet50| ImageNet| x | x | x| x| [ckpt]() | [log]()| [json]()|
Notes:
1. The **gpus** indicates the number of GPUs we used to get the checkpoint. Note that the configs we provide are intended for 8 GPUs by default.
# model settings
model = dict(
type='Recognizer3D',
backbone=dict(
type='ResNet3d',
pretrained2d=True,
pretrained='torchvision://resnet34',
depth=34,
conv_cfg=dict(type='Conv3d'),
norm_eval=False,
zero_init_residual=False),
cls_head=dict(
type='I3DHead',
num_classes=400,
in_channels=512,
spatial_type='avg',
dropout_ratio=0.5,
init_std=0.01))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train/'
data_root_val = 'data/kinetics400/rawframes_val/'
ann_file_train = 'data/kinetics400/kinetics_train_list.txt'
ann_file_val = 'data/kinetics400/kinetics_val_list.txt'
ann_file_test = 'data/kinetics400/kinetics_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
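# SampleFrames implements the '32x2x1' scheme in the config name:
# clip_len=32 frames per clip, sampled every frame_interval=2 frames, num_clips=1 clip per video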
dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.8),
random_crop=False,
max_wh_scale_gap=0),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
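# FormatShape packs the clip into a 5-D NCTHW tensor (batch, channel, time, height, width), as 3D CNNs expect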
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=1,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=32,
frame_interval=2,
num_clips=10,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
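# ThreeCrop takes three 256x256 crops along the longer spatial side at test time,
# hence '320x3 frames' in the table above: 32 frames x 10 clips x 3 crops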
dict(type='ThreeCrop', crop_size=256),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCTHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.01, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
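# clip gradients to a maximum L2 norm of 40 for training stability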
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
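# step policy decays the lr at epochs 40 and 80 (by the mmcv default factor of 0.1)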
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
checkpoint_config = dict(interval=5)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/i3d_r34_32x2x1_100e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -8,7 +8,6 @@
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[r2plus1d_r34_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb.py) |8x2| ResNet34|None |68.68|88.36|1.6 (80x3 frames)|5019|[ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8x1_180e_kinetics400_rgb_20200618-3fce5629.pth)| [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r21d_8x8.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_8x8x1_180e_kinetics400_rgb/r2plus1d_r34_8x8_69.58_88.36.log.json)|
|[r2plus1d_r34_32x2x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb.py) |8x2| ResNet34|None |74.60|91.59|0.5 (320x3 frames)|12975| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2x1_180e_kinetics400_rgb_20200618-63462eb3.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r21d_32x2.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/r2plus1d/r2plus1d_r34_32x2x1_180e_kinetics400_rgb/r2plus1d_r34_32x2_74.6_91.6.log.json)|
|[r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb](/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py) |x| ResNet34|None |x|x|x|x| [ckpt]() | [log]()| [json]()|
Notes:
1. The **gpus** indicates the number of GPUs we used to get the checkpoint. Note that the configs we provide are intended for 8 GPUs by default.
@@ -8,7 +8,6 @@
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[slowfast_r50_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb.py) |7x3| ResNet50|None |75.3|92.2|1.6 (320x3 frames)|6203|[ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16x1_256e_kinetics400_rgb_20200618-9a124260.pth)| [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/sf_4x16.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_4x16x1_256e_kinetics400_rgb/slowfast_r50_4x16_75.3_92.2.log.json)|
|[slowfast_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb.py) |8x4| ResNet50 |None|76.36|92.56|1.3 (320x3 frames)|9062| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8x1_256e_kinetics400_rgb_20200619-ecd36535.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/sf_8x8.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowfast/slowfast_r50_8x8x1_256e_kinetics400_rgb/slowfast_r50_8x8_76.36_92.56.log.json)|
|[slowfast_r50_video_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py) |x| ResNet50|None |x|x|x|x| [ckpt]() | [log]()| [json]()|
Notes:
1. The **gpus** indicates the number of GPUs we used to get the checkpoint. Note that the configs we provide are intended for 8 GPUs by default.
@@ -10,7 +10,6 @@
|[slowonly_r50_8x8x1_256e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb.py) |8x3| ResNet50 | None |74.93|91.92|2.3 (80x3 frames)|5820| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb/slowonly_r50_8x8x1_256e_kinetics400_rgb_20200703-a79c555a.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb/so_8x8.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_rgb/slowonly_r50_8x8_74.93_91.92.log.json)|
|[slowonly_r50_4x16x1_256e_kinetics400_flow](/configs/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow.py)|8x2| ResNet50 | ImageNet |61.79|83.62|x|8450| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow/slowonly_r50_4x16x1_256e_kinetics400_flow_20200704-decb8568.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow/slowonly_r50_4x16x1_256e_kinetics400_flow_61.8_83.6.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_4x16x1_256e_kinetics400_flow/slowonly_r50_4x16x1_256e_kinetics400_flow_61.8_83.6.log.json)|
|[slowonly_r50_8x8x1_196e_kinetics400_flow](/configs/recognition/slowonly/slowonly_r50_8x8x1_196e_kinetics400_flow.py) |8x4| ResNet50 | ImageNet |65.76|86.25|x|8455| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow/slowonly_r50_8x8x1_256e_kinetics400_flow_20200704-6b384243.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow/slowonly_r50_8x8x1_196e_kinetics400_flow_65.8_86.3.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/slowonly/slowonly_r50_8x8x1_256e_kinetics400_flow/slowonly_r50_8x8x1_196e_kinetics400_flow_65.8_86.3.log.json)|
|[slowonly_r50_video_4x16x1_256e_kinetics400_rgb](/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py)|x| ResNet50 | None |x|x|x|x| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)|
Notes:
1. The **gpus** indicates the number of GPUs we used to get the checkpoint. Note that the configs we provide are intended for 8 GPUs by default.
# TIN
## Model Zoo
### Kinetics-400
|config | gpus | backbone| pretrain | top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tin_r50_1x1x8_35e_kinetics400_rgb](/configs/recognition/tin/tin_r50_1x1x8_35e_kinetics400_rgb.py) |8| ResNet50| ImageNet |69.44|89.19|16.5 (8x1 frames)| 6173| [ckpt]() | [log]()| [json]()|
|[tin_r50_finetune_1x1x8_35e_kinetics400_rgb](/configs/recognition/tin/tin_r50_finetune_1x1x8_35e_kinetics400_rgb.py) |8| ResNet50| ImageNet |71.00|89.98| x | 6174 | [ckpt]() | [log]()| [json]()|
|[tin_r50_video_2d_1x1x8_35e_kinetics400_rgb](/configs/recognition/tin/tin_r50_video_1x1x8_35e_kinetics400_rgb.py) |x| ResNet50 | ImageNet | x | x | x | x | [ckpt]() | [log]()| [json]()|
### Something-Something V1
|config | gpus | backbone| pretrain | top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tin_r50_1x1x8_35e_sthv1_rgb](/configs/recognition/tin/tin_r50_1x1x8_35e_sthv1_rgb.py) |x| ResNet50 |ImageNet|41.59|71.94| x | [ckpt]() | [log]()| [json]()|
### Something-Something V2
|config | gpus | backbone | pretrain| top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tin_r50_1x1x8_35e_sthv2_rgb](/configs/recognition/tin/tin_r50_1x1x8_35e_sthv2_rgb.py) |x| ResNet50|ImageNet |53.08|82.02| x | [ckpt]() | [log]()| [json]()|
Notes:
1. The **gpus** indicates the number of GPUs we used to get the checkpoint. Note that the configs we provide are intended for 8 GPUs by default.
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs * 2 videos/gpu and lr=0.08 for 16 GPUs * 4 videos/gpu (see the sketch after these notes).
2. The **inference_time** is obtained with this [benchmark script](/tools/benchmark.py), which uses the frame-sampling strategy of the test setting and measures only the model inference time,
not including the IO time and pre-processing time. For each setting, we use 1 GPU and set the batch size (videos per GPU) to 1 to calculate the inference time.
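As a concrete reading of the scaling rule above, here is a minimal sketch (the `scale_lr` helper is illustrative, not part of the codebase; the base setting of 8 GPUs * 8 videos/gpu at lr=0.08 is assumed only so that the numbers match the example in the note):

```python
def scale_lr(base_lr, gpus, videos_per_gpu, base_gpus=8, base_videos_per_gpu=8):
    """Linear Scaling Rule: keep lr proportional to the total batch size."""
    base_batch = base_gpus * base_videos_per_gpu  # assumed base: 64 videos per step
    new_batch = gpus * videos_per_gpu
    return base_lr * new_batch / base_batch

print(scale_lr(0.08, gpus=4, videos_per_gpu=2))   # 0.01 for 4 GPUs * 2 videos/gpu
print(scale_lr(0.08, gpus=16, videos_per_gpu=4))  # 0.08 for 16 GPUs * 4 videos/gpu
```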
For more details on data preparation, you can refer to the Kinetics400, Something-Something V1 and Something-Something V2 sections in [Data Preparation](/docs/data_preparation.md).
## Train
You can use the following command to train a model.
```shell
python tools/train.py ${CONFIG_FILE} [optional arguments]
```
Example: train the TIN model on the Kinetics-400 dataset deterministically, with periodic validation.
```shell
python tools/train.py configs/recognition/tin/tin_r50_1x1x8_35e_kinetics400_rgb.py \
--work_dir work_dirs/tin_r50_1x1x8_35e_kinetics400_rgb \
--validate --seed 0 --deterministic
```
For more details, you can refer to the **Training setting** part in [getting_started](/docs/getting_started.md#training-setting).
## Test
You can use the following command to test a model.
```shell
python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
```
Example: test the TIN model on the Kinetics-400 dataset and dump the result to a JSON file.
```shell
python tools/test.py configs/recognition/tin/tin_r50_1x1x8_35e_kinetics400_rgb.py \
checkpoints/SOME_CHECKPOINT.pth --eval top_k_accuracy mean_class_accuracy \
--out result.json
```
For more details, you can refer to the **Test a dataset** part in [getting_started](/docs/getting_started.md#test-a-dataset).
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTIN',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=4),
cls_head=dict(
type='TINHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train/'
data_root_val = 'data/kinetics400/rawframes_val/'
ann_file_train = 'data/kinetics400/kinetics_train_list.txt'
ann_file_val = 'data/kinetics400/kinetics_val_list.txt'
ann_file_test = 'data/kinetics400/kinetics_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
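# 2D recognizers take per-frame NCHW input; temporal modeling happens inside the ResNetTIN backbone via temporal interlacing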
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='MultiGroupCrop', crop_size=256, groups=1),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
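# fc_lr5=True gives the final fc layer a larger (5x) learning rate, following the original TSM recipe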
lr=0.005, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[10, 20, 30])
total_epochs = 35
checkpoint_config = dict(interval=5)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tin_r50_1x1x8_35e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTIN',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=4),
cls_head=dict(
type='TINHead',
num_classes=174,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sth-v1/rawframes_train/'
data_root_val = 'data/sth-v1/rawframes_val/'
ann_file_train = 'data/sth-v1/sth-v1_train_list.txt'
ann_file_val = 'data/sth-v1/sth-v1_val_list.txt'
ann_file_test = 'data/sth-v1/sth-v1_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='MultiGroupCrop', crop_size=256, groups=1),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
filename_tmpl='{:05}.jpg',
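# rawframes are stored as zero-padded five-digit JPEGs, e.g. 00001.jpg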
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.005, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[10, 20, 30])
total_epochs = 35
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tin_r50_1x1x8_35e_sthv1_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTIN',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=4),
cls_head=dict(
type='TINHead',
num_classes=339,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sth-v2/rawframes_train/'
data_root_val = 'data/sth-v2/rawframes_val/'
ann_file_train = 'data/sth-v2/sth-v2_train_list.txt'
ann_file_val = 'data/sth-v2/sth-v2_val_list.txt'
ann_file_test = 'data/sth-v2/sth-v2_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='MultiGroupCrop', crop_size=256, groups=1),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
filename_tmpl='{:05}.jpg',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.005, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[10, 20, 30])
total_epochs = 35
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tin_r50_1x1x8_35e_sthv2_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTIN',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=4),
cls_head=dict(
type='TINHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train/'
data_root_val = 'data/kinetics400/rawframes_val/'
ann_file_train = 'data/kinetics400/kinetics_train_list.txt'
ann_file_val = 'data/kinetics400/kinetics_val_list.txt'
ann_file_test = 'data/kinetics400/kinetics_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='MultiGroupCrop', crop_size=256, groups=1),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=False),
lr=0.005, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[10, 20, 30])
total_epochs = 35
checkpoint_config = dict(interval=5)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tin_r50_finetune_1x1x8_35e_kinetics400_rgb/'
load_from = './modelzoo/kinetics400_tsm_ckpt_lite_for_finetune.pth'
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTIN',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=4),
cls_head=dict(
type='TINHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'VideoDataset'
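# this variant trains from videos directly: 'VideoDataset' entries are decoded on the fly
# (see DecordInit/DecordDecode below) instead of using pre-extracted rawframes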
data_root = 's3://lizz.ssd/datasets/kinetics400_256/'
data_root_val = 's3://lizz.ssd/datasets/kinetics400_256/'
ann_file_train = 'data/kinetics400/k400_train.txt'
ann_file_val = 'data/kinetics400/k400_val.txt'
ann_file_test = 'data/kinetics400/k400_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='MultiGroupCrop', crop_size=256, groups=1),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=6,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.005, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0001)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[10, 20, 30])
total_epochs = 35
checkpoint_config = dict(interval=5)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tin_r50_video_2d_1x1x8_35e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -6,26 +6,22 @@
|config | gpus | backbone | pretrain | top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsm_r50_video_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_video_1x1x8_100e_kinetics400_rgb.py) |x| ResNet50| ImageNet | x | x | x | 7077 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)|
|[tsm_r50_1x1x8_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb.py) |8| ResNet50| ImageNet |70.24 (70.36)|89.56 (89.49)|74.0 (8x1 frames)| 7079 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/tsm_r50_1x1x8_50e_kinetics400_rgb_20200607-af7fb746.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_kinetics400_rgb/20200607_211800.log.json)|
|[tsm_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb.py) |8x4| ResNet50 | ImageNet|72.9 (72.22)|90.44 (90.37)|11.5 (8x10 frames)| 7079 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/tsm_r50_dense_1x1x8_100e_kinetics400_rgb_20200626-91a54551.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_dense_1x1x8_100e_kinetics400_rgb/20200626_213415.log.json)|
|[tsm_r50_1x1x16_50e_kinetics400_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb.py) |8| ResNet50| ImageNet |71.69 (70.67)|90.4 (89.98)|47.0 (16x1 frames)| 10404 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/tsm_r50_1x1x16_50e_kinetics400_rgb_20200607-f731bffc.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20200607_221310.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_kinetics400_rgb/20200607_221310.log.json)|
### Something-Something V1
|config | gpus | backbone| pretrain | top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsm_r50_1x1x16_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv1_rgb.py) |x| ResNet50 | ImageNet|43.81|74.73| x | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)|
|[tsm_r101_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv1_rgb.py) |x| ResNet101| ImageNet |46.41|74.07| x | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)|
|[tsm_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb.py) |8| ResNet50 | ImageNet|44.62 (42.08)|75.51 (72.66)| 7077| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/tsm_r50_1x1x8_50e_sthv1_rgb_20200616-3417f361.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20200616_022852.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x8_50e_sthv1_rgb/20200616_022852.log.json)|
### Something-Something V2
|config | gpus | backbone | pretrain| top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsm_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x8_50e_sthv2_rgb.py) |x| ResNet50| ImageNet |59.91|84.61| x| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/)|
|[tsm_r50_1x1x16_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb.py) |8| ResNet50| ImageNet |57.68 (56.57)|83.65 (84.30)| 10400| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/tsm_r50_1x1x16_50e_sthv2_rgb_20200621-60ff441a.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20200621_101921.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r50_1x1x16_50e_sthv2_rgb/20200621_101921.log.json)|
|[tsm_r101_1x1x8_50e_sthv2_rgb](/configs/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb.py) |8| ResNet101 | ImageNet|59.12 (59.20)|85.74 (85.27)| 9784 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/tsm_r101_1x1x8_50e_sthv2_rgb_20200625-df82f5e6.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20200625_224131.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsm/tsm_r101_1x1x8_50e_sthv2_rgb/20200625_224131.log.json)|
Notes:
1. The **gpus** indicates the number of GPUs we used to get the checkpoint. Note that the configs we provide are intended for 8 GPUs by default.
@@ -33,6 +29,7 @@
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use a different number of GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs * 2 videos/gpu and lr=0.08 for 16 GPUs * 4 videos/gpu.
2. The **inference_time** is obtained with this [benchmark script](/tools/benchmark.py), which uses the frame-sampling strategy of the test setting and measures only the model inference time,
not including the IO time and pre-processing time. For each setting, we use 1 GPU and set the batch size (videos per GPU) to 1 to calculate the inference time.
3. The values in brackets are the results obtained by training with the [original repo](https://github.com/mit-han-lab/temporal-shift-module), using the same model settings.
For more details on data preparation, you can refer to the Kinetics400, Something-Something V1 and Something-Something V2 sections in [Data Preparation](/docs/data_preparation.md).
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTSM',
pretrained='torchvision://resnet101',
depth=101,
norm_eval=False,
shift_div=8),
cls_head=dict(
type='TSMHead',
num_classes=174,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001,
is_shift=True))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sth-v1/rawframes_train/'
data_root_val = 'data/sth-v1/rawframes_val/'
ann_file_train = 'data/sth-v1/sth-v1_train_list.txt'
ann_file_val = 'data/sth-v1/sth-v1_val_list.txt'
ann_file_test = 'data/sth-v1/sth-v1_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
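# note: no random Flip in this pipeline; horizontal flipping can invert direction-sensitive
# Something-Something labels (e.g. 'Pushing something from left to right')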
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
filename_tmpl='{:05}.jpg',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.01, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0005)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[20, 40])
total_epochs = 50
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsm_r101_1x1x8_50e_sthv1_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTSM',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=8),
cls_head=dict(
type='TSMHead',
num_classes=174,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001,
is_shift=True))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sth-v1/rawframes_train/'
data_root_val = 'data/sth-v1/rawframes_val/'
ann_file_train = 'data/sth-v1/sth-v1_train_list.txt'
ann_file_val = 'data/sth-v1/sth-v1_val_list.txt'
ann_file_test = 'data/sth-v1/sth-v1_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=16),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=16,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=16,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
filename_tmpl='{:05}.jpg',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.005, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0005)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[20, 40])
total_epochs = 50
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsm_r50_1x1x16_50e_sthv1_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNetTSM',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False,
shift_div=8),
cls_head=dict(
type='TSMHead',
num_classes=339,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.001,
is_shift=True))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/sth-v2/rawframes_train/'
data_root_val = 'data/sth-v2/rawframes_val/'
ann_file_train = 'data/sth-v2/sth-v2_train_list.txt'
ann_file_val = 'data/sth-v2/sth-v2_val_list.txt'
ann_file_test = 'data/sth-v2/sth-v2_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=8,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
filename_tmpl='{:05}.jpg',
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
filename_tmpl='{:05}.jpg',
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD',
constructor='TSMOptimizerConstructor',
paramwise_cfg=dict(fc_lr5=True),
lr=0.01, # this lr is used for 8 gpus
momentum=0.9,
weight_decay=0.0005)
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[20, 40])
total_epochs = 50
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsm_r50_1x1x8_50e_sthv2_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
@@ -6,27 +6,23 @@
|config | gpus | backbone | pretrain | top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsn_r50_1x1x3_80e_ucf101_rgb](/configs/recognition/tsn/tsn_r50_1x1x3_80e_ucf101_rgb.py) |8| ResNet50 | ImageNet |80.12|96.09|8332| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_80e_ucf101_rgb/tsn_r50_1x1x3_80e_ucf101_rgb_20200613-d6ad9c48.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_80e_ucf101_rgb/20200613_020013.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_80e_ucf101_rgb/20200613_020013.log.json)|
### Kinetics-400
|config | gpus | backbone|pretrain | top1 acc| top5 acc | inference_time(video/s) | gpu_mem(M)| ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsn_r50_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb.py) |8| ResNet50 | ImageNet|70.60|89.26|4.3 (25x10 frames)|8344| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/tsn_r50_1x1x3_100e_kinetics400_rgb_20200614-e508be42.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/20200614_063526.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x3_100e_kinetics400_rgb/20200614_063526.log.json)|
|[tsn_r50_1x1x5_50e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_1x1x5_50e_kinetics400_rgb.py) |8| ResNet50| ImageNet |68.64|88.19|86.7 (8x1 frames)|7031| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x5_50e_kinetics400_rgb/tsn_r50_1x1x5_50e_kinetics400_rgb_20200608-058a82c3.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x5_50e_kinetics400_rgb/20200608_161221.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x5_50e_kinetics400_rgb/20200608_161221.log.json)|
|[tsn_r50_dense_1x1x5_50e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb.py) |8x3| ResNet50| ImageNet |70.18 (69.15)|89.10 (88.56)|12.7 (8x10 frames)|7028| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/tsn_r50_dense_1x1x5_100e_kinetics400_rgb_20200627-a063165f.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/20200627_105310.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x5_100e_kinetics400_rgb/20200627_105310.log.json)|
|[tsn_r50_320p_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb.py) |8x2| ResNet50| ImageNet |70.91|89.51|10.7 (25x3 frames)| 8344 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_320p_1x1x3_100e_kinetics400_rgb_20200702-cc665e2a.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_100e_kinetics400_rgb/tsn_r50_f3_kinetics400_shortedge_70.9_89.5.log.json)|
|[tsn_r50_320p_1x1x3_110e_kinetics400_flow](/configs/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow.py) |8x2| ResNet50 | ImageNet|55.70|79.85|x| 8471 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_320p_1x1x3_110e_kinetics400_flow_20200705-3036bab6.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_f3_kinetics400_flow_shortedge_55.7_79.9.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x3_110e_kinetics400_flow/tsn_r50_f3_kinetics400_flow_shortedge_55.7_79.9.log.json)|
|tsn_r50_320p_1x1x3_kinetics400_twostream [1: 1]* |x| ResNet50 | ImageNet|72.76|90.52| x | x | x | x | x|
|[tsn_r50_320p_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb.py) |8x3| ResNet50| ImageNet |72.41|90.55|11.1 (25x3 frames)| 8344 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_320p_1x1x8_100e_kinetics400_rgb_20200702-ef80e3d7.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_f8_kinetics400_shortedge_72.4_90.6.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_100e_kinetics400_rgb/tsn_r50_f8_kinetics400_shortedge_72.4_90.6.log.json)|
|[tsn_r50_320p_1x1x8_110e_kinetics400_flow](/configs/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow.py) |8x4| ResNet50 | ImageNet|57.76|80.99|x| 8473 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_320p_1x1x8_110e_kinetics400_flow_20200705-1f39486b.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_f8_kinetics400_flow_shortedge_57.8_81.0.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_320p_1x1x8_110e_kinetics400_flow/tsn_r50_f8_kinetics400_flow_shortedge_57.8_81.0.log.json)|
|[tsn_r50_video_1x1x3_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x3_100e_kinetics400_rgb.py) |8| ResNet50| ImageNet | 66.12 | 86.13 |x|8339| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x3_100e_kinetics400_rgb/tsn_r50_video_1x1x3_100e_kinetics400_rgb_20200622-b1e2040b.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x3_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x3_100e_kinetics400_rgb.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x3_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x3_100e_kinetics400_rgb.log.json)|
|[tsn_r50_video_1x1x5_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x5_100e_kinetics400_rgb.py) |8| ResNet50| ImageNet | 68.24 | 87.79 |x|13631| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x5_100e_kinetics400_rgb/tsn_r50_video_1x1x5_100e_kinetics400_rgb_20200702-fab80595.pth)| [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x5_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x5_100e_kinetics400_rgb.log) | [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x5_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x5_100e_kinetics400_rgb.log.json)|
|tsn_r50_320p_1x1x8_kinetics400_twostream [1: 1]* |x| ResNet50| ImageNet |74.64|91.77| x | x | x | x | x|
|[tsn_r50_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb.py) |8| ResNet50 | ImageNet|70.77 (68.75)|89.3 (88.42)|12.2 (8x10 frames)|8344| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_dense_1x1x8_100e_kinetics400_rgb_20200606-e925e6e3.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb/20200606_003901.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_dense_1x1x8_100e_kinetics400_rgb/20200606_003901.log.json)|
|[tsn_r50_video_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb.py) |8| ResNet50| ImageNet | 69.22 | 88.69 |x|21558| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb/tsn_r50_video_1x1x8_100e_kinetics400_rgb_20200702-568cde33.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_100e_kinetics400_rgb.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_100e_kinetics400_rgb.log.json)|
|[tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb](/configs/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb.py) |8| ResNet50| ImageNet | 70.21 | 89.00 |x|21553| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb_20200703-0f19175f.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_dense_100e_kinetics400_rgb.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb/tsn_r50_video_2d_1x1x8_dense_100e_kinetics400_rgb.log.json)|
Here, we use [1: 1] to indicate that we combine the RGB and flow scores with coefficients 1:1 to get the two-stream prediction (without applying softmax).
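For clarity, a minimal sketch of this fusion follows; the array shapes and the `weights` argument are illustrative assumptions, not code from this repo:

```python
import numpy as np

def fuse_two_stream(rgb_scores, flow_scores, weights=(1, 1)):
    # Combine raw (pre-softmax) per-class scores from the two streams.
    fused = weights[0] * np.asarray(rgb_scores) + weights[1] * np.asarray(flow_scores)
    return fused.argmax(axis=-1)  # predicted class id per video

# e.g. per-class scores of shape (num_videos, 400) for Kinetics-400
rgb = np.random.rand(2, 400)
flow = np.random.rand(2, 400)
print(fuse_two_stream(rgb, flow))  # two predicted class ids
```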
......@@ -34,15 +30,15 @@
|config | gpus| backbone |pretrain| top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsn_r50_1x1x8_50e_sthv1_rgb](/configs/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb.py) |8| ResNet50 | ImageNet|18.55 (17.53)|44.80 (44.29)| 10978 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb/tsn_r50_1x1x8_50e_sthv1_rgb_20200618-061b9195.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb/tsn_sthv1.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv1_rgb/tsn_r50_f8_sthv1_18.1_45.0.log.json)|
|[tsn_r50_1x1x16_50e_sthv1_rgb](/configs/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb.py) |8| ResNet50| ImageNet |15.77 (13.33)|39.85 (35.58)| 5691 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb/tsn_r50_1x1x16_50e_sthv1_rgb_20200614-7e2fe4f1.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb/20200614_211932.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv1_rgb/20200614_211932.log.json)|
### Something-Something V2
|config | gpus| backbone| pretrain | top1 acc| top5 acc | gpu_mem(M) | ckpt | log| json|
|:--|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|:--:|
|[tsn_r50_1x1x8_50e_sthv2_rgb](/configs/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb.py) |8x2| ResNet50| ImageNet |32.41 (30.32)|64.05 (58.38)| 10978 | [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb/tsn_r50_1x1x8_50e_sthv2_rgb_20200618-096db436.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb/tsn_sthv2.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x8_50e_sthv2_rgb/tsn_r50_f8_sthv2_32.4_64.1.log.json)|
|[tsn_r50_1x1x16_50e_sthv2_rgb](/configs/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb.py) |8| ResNet50| ImageNet |22.48 (22.50)|49.08 (47.29)|5698| [ckpt](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb/tsn_r50_1x1x16_50e_sthv2_rgb_20200614-b55c5700.pth) | [log](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb/20200614_203248.log)| [json](https://openmmlab.oss-cn-hangzhou.aliyuncs.com/mmaction/recognition/tsn/tsn_r50_1x1x16_50e_sthv2_rgb/20200614_203248.log.json)|
### Moments in Time
......@@ -62,6 +58,7 @@
According to the [Linear Scaling Rule](https://arxiv.org/abs/1706.02677), you may set the learning rate proportional to the batch size if you use different GPUs or videos per GPU,
e.g., lr=0.01 for 4 GPUs * 2 video/gpu and lr=0.08 for 16 GPUs * 4 video/gpu (a worked sketch of this rule follows these notes).
2. The **inference_time** is obtained with this [benchmark script](/tools/benchmark.py), which uses the frame-sampling strategy of the test setting and measures only the model inference time,
excluding the IO and pre-processing time. For each setting, we use 1 GPU and set the batch size (videos per gpu) to 1 to calculate the inference time.
3. The values in brackets are the results obtained by training with the [original repo](https://github.com/mit-han-lab/temporal-shift-module), using the same model settings.
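The Linear Scaling Rule in note 1 amounts to a one-line computation. The sketch below uses the 4 GPUs * 2 video/gpu, lr=0.01 setting from that note as the reference point; the function name is ours, not part of the codebase:

```python
def scaled_lr(gpus, videos_per_gpu, base_lr=0.01, base_gpus=4, base_videos_per_gpu=2):
    # lr scales linearly with the total batch size (gpus * videos per gpu)
    return base_lr * (gpus * videos_per_gpu) / (base_gpus * base_videos_per_gpu)

print(scaled_lr(4, 2))   # 0.01 -- the reference setting from note 1
print(scaled_lr(16, 4))  # 0.08 -- matches the other example in note 1
```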
For more details on data preparation, you can refer to [preparing_ucf101](/tools/data/ucf101/preparing_ucf101.md),
[preparing_kinetics400](/tools/data/kinetics400/preparing_kinetics400.md), [preparing_sthv1](/tools/data/sthv1/preparing_sthv1.md),
......
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNet',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False),
cls_head=dict(
type='TSNHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.5,
init_std=0.01))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train/'
data_root_val = 'data/kinetics400/rawframes_val/'
ann_file_train = 'data/kinetics400/kinetics_train_list.txt'
ann_file_val = 'data/kinetics400/kinetics_val_list.txt'
ann_file_test = 'data/kinetics400/kinetics_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
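    # '1x1x5' in the config name encodes clip_len x frame_interval x num_clips (see SampleFrames below)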
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=16,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.02, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=20, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[20, 40])
total_epochs = 50
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsn_r50_1x1x5_50e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNet',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False),
cls_head=dict(
type='TSNHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.4,
init_std=0.01))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'RawframeDataset'
data_root = 'data/kinetics400/rawframes_train/'
data_root_val = 'data/kinetics400/rawframes_val/'
ann_file_train = 'data/kinetics400/kinetics_train_list.txt'
ann_file_val = 'data/kinetics400/kinetics_val_list.txt'
ann_file_test = 'data/kinetics400/kinetics_val_list.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1,
num_fixed_crops=13),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=8,
test_mode=True),
dict(type='FrameSelector'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=12,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.015, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=2, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsn_r50_1x1x8_100e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNet',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False),
cls_head=dict(
type='TSNHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.4,
init_std=0.01))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'VideoDataset'
data_root = 's3://lizz.ssd/datasets/kinetics400_256/'
data_root_val = 's3://lizz.ssd/datasets/kinetics400_256/'
ann_file_train = 'data/kinetics400/k400_train.txt'
ann_file_val = 'data/kinetics400/k400_val.txt'
ann_file_test = 'data/kinetics400/k400_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
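    # DecordInit/DecordDecode read frames directly from video files (VideoDataset), instead of pre-extracted rawframes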
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=3),
dict(type='DecordDecode'),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=3,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='TenCrop', crop_size=224),
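    # 25 clips (num_clips above) x 10 crops = 250 views per video at test time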
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=32,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.01, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
checkpoint_config = dict(interval=5)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsn_r50_video_2d_1x1x3_100e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNet',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False),
cls_head=dict(
type='TSNHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.4,
init_std=0.01))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'VideoDataset'
data_root = 's3://lizz.ssd/datasets/kinetics400_256_fast/'
data_root_val = 's3://lizz.ssd/datasets/kinetics400_256_fast/'
ann_file_train = 'data/kinetics400/k400_train.txt'
ann_file_val = 'data/kinetics400/k400_val.txt'
ann_file_test = 'data/kinetics400/k400_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=5),
dict(type='DecordDecode'),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=5,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='SampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='TenCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=32,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.01, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsn_r50_video_1x1x5_100e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]
# model settings
model = dict(
type='Recognizer2D',
backbone=dict(
type='ResNet',
pretrained='torchvision://resnet50',
depth=50,
norm_eval=False),
cls_head=dict(
type='TSNHead',
num_classes=400,
in_channels=2048,
spatial_type='avg',
consensus=dict(type='AvgConsensus', dim=1),
dropout_ratio=0.4,
init_std=0.01))
# model training and testing settings
train_cfg = None
test_cfg = dict(average_clips=None)
# dataset settings
dataset_type = 'VideoDataset'
data_root = 's3://lizz.ssd/datasets/kinetics400_256_fast/'
data_root_val = 's3://lizz.ssd/datasets/kinetics400_256_fast/'
ann_file_train = 'data/kinetics400/k400_train.txt'
ann_file_val = 'data/kinetics400/k400_val.txt'
ann_file_test = 'data/kinetics400/k400_val.txt'
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False)
train_pipeline = [
dict(type='DecordInit'),
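    # 'dense' variant: DenseSampleFrames draws clips from a local window of the video rather than uniformly across its full length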
dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=5),
dict(type='DecordDecode'),
dict(
type='MultiScaleCrop',
input_size=224,
scales=(1, 0.875, 0.75, 0.66),
random_crop=False,
max_wh_scale_gap=1),
dict(type='Resize', scale=(224, 224), keep_ratio=False),
dict(type='Flip', flip_ratio=0.5),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs', 'label'])
]
val_pipeline = [
dict(type='DecordInit'),
dict(
type='DenseSampleFrames',
clip_len=1,
frame_interval=1,
num_clips=5,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='CenterCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
test_pipeline = [
dict(type='DecordInit'),
dict(
type='DenseSampleFrames',
clip_len=1,
frame_interval=1,
num_clips=25,
test_mode=True),
dict(type='DecordDecode'),
dict(type='Resize', scale=(-1, 256)),
dict(type='TenCrop', crop_size=224),
dict(type='Flip', flip_ratio=0),
dict(type='Normalize', **img_norm_cfg),
dict(type='FormatShape', input_format='NCHW'),
dict(type='Collect', keys=['imgs', 'label'], meta_keys=[]),
dict(type='ToTensor', keys=['imgs'])
]
data = dict(
videos_per_gpu=32,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ann_file=ann_file_train,
data_prefix=data_root,
pipeline=train_pipeline),
val=dict(
type=dataset_type,
ann_file=ann_file_val,
data_prefix=data_root_val,
pipeline=val_pipeline),
test=dict(
type=dataset_type,
ann_file=ann_file_test,
data_prefix=data_root_val,
pipeline=test_pipeline))
# optimizer
optimizer = dict(
type='SGD', lr=0.01, momentum=0.9,
weight_decay=0.0001) # this lr is used for 8 gpus
optimizer_config = dict(grad_clip=dict(max_norm=40, norm_type=2))
# learning policy
lr_config = dict(policy='step', step=[40, 80])
total_epochs = 100
checkpoint_config = dict(interval=1)
evaluation = dict(
interval=5, metrics=['top_k_accuracy', 'mean_class_accuracy'], topk=(1, 5))
log_config = dict(
interval=20,
hooks=[
dict(type='TextLoggerHook'),
# dict(type='TensorboardLoggerHook'),
])
# runtime settings
dist_params = dict(backend='nccl')
log_level = 'INFO'
work_dir = './work_dirs/tsn_r50_video_dense_1x1x5_100e_kinetics400_rgb/'
load_from = None
resume_from = None
workflow = [('train', 1)]