From 083ea3324f818aeaad71e470e090eeb65de9924f Mon Sep 17 00:00:00 2001 From: Jintao Lin Date: Sun, 9 Aug 2020 08:19:22 +0800 Subject: [PATCH] Move `start_index` from `SampleFrames` to dataset level (#89) --- ...d_r50_video_32x2x1_100e_kinetics400_rgb.py | 9 +-- ...o_inference_32x2x1_100e_kinetics400_rgb.py | 1 - ...1d_r34_video_8x8x1_180e_kinetics400_rgb.py | 9 +-- ...eo_inference_8x8x1_180e_kinetics400_rgb.py | 1 - ...t_r50_video_4x16x1_256e_kinetics400_rgb.py | 9 +-- ...o_inference_4x16x1_256e_kinetics400_rgb.py | 1 - ...y_r50_video_4x16x1_256e_kinetics400_rgb.py | 9 +-- ...o_inference_4x16x1_256e_kinetics400_rgb.py | 1 - ...tsm_r50_video_1x1x8_50e_kinetics400_rgb.py | 9 +-- ...eo_inference_1x1x8_100e_kinetics400_rgb.py | 1 - ...sn_r50_video_1x1x8_100e_kinetics400_rgb.py | 9 +-- ..._video_dense_1x1x8_100e_kinetics400_rgb.py | 9 +-- ...eo_inference_1x1x3_100e_kinetics400_rgb.py | 1 - mmaction/apis/inference.py | 9 ++- mmaction/datasets/base.py | 8 +++ mmaction/datasets/pipelines/loading.py | 38 ++++++------- mmaction/datasets/rawframe_dataset.py | 5 +- mmaction/datasets/video_dataset.py | 13 +++++ tests/test_data/test_dataset.py | 57 +++++++++++++++---- tests/test_data/test_loading.py | 55 +++++++++++++++--- 20 files changed, 150 insertions(+), 104 deletions(-) diff --git a/configs/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb.py b/configs/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb.py index 39021f2..afaa0f6 100644 --- a/configs/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb.py +++ b/configs/recognition/i3d/i3d_r50_video_32x2x1_100e_kinetics400_rgb.py @@ -31,12 +31,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - start_index=0), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict( @@ -59,7 +54,6 @@ val_pipeline = [ clip_len=32, frame_interval=2, num_clips=1, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -77,7 +71,6 @@ test_pipeline = [ clip_len=32, frame_interval=2, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py b/configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py index 2d08cf6..a58f82a 100644 --- a/configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py +++ b/configs/recognition/i3d/i3d_r50_video_inference_32x2x1_100e_kinetics400_rgb.py @@ -30,7 +30,6 @@ test_pipeline = [ clip_len=32, frame_interval=2, num_clips=1, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py index 402bb45..d1c7b4f 100644 --- a/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py +++ b/configs/recognition/r2plus1d/r2plus1d_r34_video_8x8x1_180e_kinetics400_rgb.py @@ -38,12 +38,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=8, - frame_interval=8, - num_clips=1, - start_index=0), 
+ dict(type='SampleFrames', clip_len=8, frame_interval=8, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -61,7 +56,6 @@ val_pipeline = [ clip_len=8, frame_interval=8, num_clips=1, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -79,7 +73,6 @@ test_pipeline = [ clip_len=8, frame_interval=8, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/r2plus1d/r2plus1d_r34_video_inference_8x8x1_180e_kinetics400_rgb.py b/configs/recognition/r2plus1d/r2plus1d_r34_video_inference_8x8x1_180e_kinetics400_rgb.py index 134b050..6d1f23a 100644 --- a/configs/recognition/r2plus1d/r2plus1d_r34_video_inference_8x8x1_180e_kinetics400_rgb.py +++ b/configs/recognition/r2plus1d/r2plus1d_r34_video_inference_8x8x1_180e_kinetics400_rgb.py @@ -38,7 +38,6 @@ test_pipeline = [ clip_len=8, frame_interval=8, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py index 2d1dc66..c75f6f8 100644 --- a/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py +++ b/configs/recognition/slowfast/slowfast_r50_video_4x16x1_256e_kinetics400_rgb.py @@ -45,12 +45,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=32, - frame_interval=2, - num_clips=1, - start_index=0), + dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -68,7 +63,6 @@ val_pipeline = [ clip_len=32, frame_interval=2, num_clips=1, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -86,7 +80,6 @@ test_pipeline = [ clip_len=32, frame_interval=2, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/slowfast/slowfast_r50_video_inference_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowfast/slowfast_r50_video_inference_4x16x1_256e_kinetics400_rgb.py index 13249a6..26246fe 100644 --- a/configs/recognition/slowfast/slowfast_r50_video_inference_4x16x1_256e_kinetics400_rgb.py +++ b/configs/recognition/slowfast/slowfast_r50_video_inference_4x16x1_256e_kinetics400_rgb.py @@ -48,7 +48,6 @@ test_pipeline = [ clip_len=32, frame_interval=2, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py index fbe7f44..1ac5688 100644 --- a/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py +++ b/configs/recognition/slowonly/slowonly_r50_video_4x16x1_256e_kinetics400_rgb.py @@ -28,12 +28,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=4, - frame_interval=16, - num_clips=1, - start_index=0), + dict(type='SampleFrames', clip_len=4, frame_interval=16, 
num_clips=1), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict(type='RandomResizedCrop'), @@ -51,7 +46,6 @@ val_pipeline = [ clip_len=4, frame_interval=16, num_clips=1, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -69,7 +63,6 @@ test_pipeline = [ clip_len=4, frame_interval=16, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/slowonly/slowonly_r50_video_inference_4x16x1_256e_kinetics400_rgb.py b/configs/recognition/slowonly/slowonly_r50_video_inference_4x16x1_256e_kinetics400_rgb.py index fa8b57e..24d48ca 100644 --- a/configs/recognition/slowonly/slowonly_r50_video_inference_4x16x1_256e_kinetics400_rgb.py +++ b/configs/recognition/slowonly/slowonly_r50_video_inference_4x16x1_256e_kinetics400_rgb.py @@ -28,7 +28,6 @@ test_pipeline = [ clip_len=4, frame_interval=16, num_clips=10, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py index 2e9c35d..e31d1d2 100644 --- a/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py +++ b/configs/recognition/tsm/tsm_r50_video_1x1x8_50e_kinetics400_rgb.py @@ -30,12 +30,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - start_index=0), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), dict( @@ -59,7 +54,6 @@ val_pipeline = [ clip_len=1, frame_interval=1, num_clips=8, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -77,7 +71,6 @@ test_pipeline = [ clip_len=1, frame_interval=1, num_clips=8, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py index fd60092..082943a 100644 --- a/configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py +++ b/configs/recognition/tsm/tsm_r50_video_inference_1x1x8_100e_kinetics400_rgb.py @@ -30,7 +30,6 @@ test_pipeline = [ clip_len=1, frame_interval=1, num_clips=8, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb.py index f3a476e..f1449a5 100644 --- a/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb.py +++ b/configs/recognition/tsn/tsn_r50_video_1x1x8_100e_kinetics400_rgb.py @@ -28,12 +28,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='SampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - start_index=0), + dict(type='SampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='DecordDecode'), dict( type='MultiScaleCrop', @@ -55,7 +50,6 @@ val_pipeline = [ clip_len=1, frame_interval=1, num_clips=8, - start_index=0, test_mode=True), dict(type='DecordDecode'), 
dict(type='Resize', scale=(-1, 256)), @@ -73,7 +67,6 @@ test_pipeline = [ clip_len=1, frame_interval=1, num_clips=25, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb.py b/configs/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb.py index aa0d7fc..9657dfe 100644 --- a/configs/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb.py +++ b/configs/recognition/tsn/tsn_r50_video_dense_1x1x8_100e_kinetics400_rgb.py @@ -28,12 +28,7 @@ img_norm_cfg = dict( mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_bgr=False) train_pipeline = [ dict(type='DecordInit'), - dict( - type='DenseSampleFrames', - clip_len=1, - frame_interval=1, - num_clips=8, - start_index=0), + dict(type='DenseSampleFrames', clip_len=1, frame_interval=1, num_clips=8), dict(type='DecordDecode'), dict( type='MultiScaleCrop', @@ -55,7 +50,6 @@ val_pipeline = [ clip_len=1, frame_interval=1, num_clips=8, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), @@ -73,7 +67,6 @@ test_pipeline = [ clip_len=1, frame_interval=1, num_clips=8, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py b/configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py index ca644a4..5a93fad 100644 --- a/configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py +++ b/configs/recognition/tsn/tsn_r50_video_inference_1x1x3_100e_kinetics400_rgb.py @@ -27,7 +27,6 @@ test_pipeline = [ clip_len=1, frame_interval=1, num_clips=25, - start_index=0, test_mode=True), dict(type='DecordDecode'), dict(type='Resize', scale=(-1, 256)), diff --git a/mmaction/apis/inference.py b/mmaction/apis/inference.py index d64ad46..33952f1 100644 --- a/mmaction/apis/inference.py +++ b/mmaction/apis/inference.py @@ -88,15 +88,22 @@ def inference_recognizer(model, video_path, label_path, use_frames=False): if use_frames: filename_tmpl = cfg.data.test.get('filename_tmpl', 'img_{:05}.jpg') modality = cfg.data.test.get('modality', 'RGB') + start_index = cfg.data.test.get('start_index', 1) data = dict( frame_dir=video_path, total_frames=len(os.listdir(video_path)), # assuming files in ``video_path`` are all named with ``filename_tmpl`` # noqa: E501 label=-1, + start_index=start_index, filename_tmpl=filename_tmpl, modality=modality) else: - data = dict(filename=video_path, label=-1, modality='RGB') + start_index = cfg.data.test.get('start_index', 0) + data = dict( + filename=video_path, + label=-1, + start_index=start_index, + modality='RGB') data = test_pipeline(data) data = collate([data], samples_per_gpu=1) if next(model.parameters()).is_cuda: diff --git a/mmaction/datasets/base.py b/mmaction/datasets/base.py index 8a04a43..25cb486 100644 --- a/mmaction/datasets/base.py +++ b/mmaction/datasets/base.py @@ -32,6 +32,10 @@ class BaseDataset(Dataset, metaclass=ABCMeta): dataset. Default: False. num_classes (int): Number of classes of the dataset, used in multi-class datasets. Default: None. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking videos as input, + it should be set to 0, since frames loaded from videos count + from 0. Default: 1. modality (str): Modality of data. Support 'RGB', 'Flow'. Default: 'RGB'. 
""" @@ -43,6 +47,7 @@ class BaseDataset(Dataset, metaclass=ABCMeta): test_mode=False, multi_class=False, num_classes=None, + start_index=1, modality='RGB'): super().__init__() @@ -52,6 +57,7 @@ class BaseDataset(Dataset, metaclass=ABCMeta): self.test_mode = test_mode self.multi_class = multi_class self.num_classes = num_classes + self.start_index = start_index self.modality = modality self.pipeline = Compose(pipeline) self.video_infos = self.load_annotations() @@ -83,12 +89,14 @@ class BaseDataset(Dataset, metaclass=ABCMeta): """Prepare the frames for training given the index.""" results = copy.deepcopy(self.video_infos[idx]) results['modality'] = self.modality + results['start_index'] = self.start_index return self.pipeline(results) def prepare_test_frames(self, idx): """Prepare the frames for testing given the index.""" results = copy.deepcopy(self.video_infos[idx]) results['modality'] = self.modality + results['start_index'] = self.start_index return self.pipeline(results) def __len__(self): diff --git a/mmaction/datasets/pipelines/loading.py b/mmaction/datasets/pipelines/loading.py index 841d232..fb73154 100644 --- a/mmaction/datasets/pipelines/loading.py +++ b/mmaction/datasets/pipelines/loading.py @@ -17,18 +17,14 @@ from ..registry import PIPELINES class SampleFrames(object): """Sample frames from the video. - Required keys are "filename", "total_frames", added or modified keys are - "frame_inds", "frame_interval" and "num_clips". + Required keys are "filename", "total_frames", "start_index" , added or + modified keys are "frame_inds", "frame_interval" and "num_clips". Args: clip_len (int): Frames of each sampled output clip. frame_interval (int): Temporal interval of adjacent sampled frames. Default: 1. num_clips (int): Number of clips to be sampled. Default: 1. - start_index (int): Specify a start index for frames in consideration of - different filename format. However, when taking videos as input, - it should be set to 0, since frames loaded from videos count - from 0. Default: 1. temporal_jitter (bool): Whether to apply temporal jittering. Default: False. twice_sample (bool): Whether to use twice sample when testing. @@ -39,28 +35,35 @@ class SampleFrames(object): Default: 'loop'. test_mode (bool): Store True when building test or validation dataset. Default: False. + start_index (None): This argument is deprecated and moved to dataset + class (``BaseDataset``, ``VideoDatset``, ``RawframeDataset``, etc), + see this: https://github.com/open-mmlab/mmaction2/pull/89. """ def __init__(self, clip_len, frame_interval=1, num_clips=1, - start_index=1, temporal_jitter=False, twice_sample=False, out_of_bound_opt='loop', - test_mode=False): + test_mode=False, + start_index=None): self.clip_len = clip_len self.frame_interval = frame_interval self.num_clips = num_clips - self.start_index = start_index self.temporal_jitter = temporal_jitter self.twice_sample = twice_sample self.out_of_bound_opt = out_of_bound_opt self.test_mode = test_mode assert self.out_of_bound_opt in ['loop', 'repeat_last'] + if start_index is not None: + warnings.warn('No longer support "start_index" in "SampleFrames", ' + 'it should be set in dataset class, see this pr: ' + 'https://github.com/open-mmlab/mmaction2/pull/89') + def _get_train_clips(self, num_frames): """Get clip offsets in train mode. 
@@ -165,7 +168,9 @@ class SampleFrames(object): frame_inds = new_inds else: raise ValueError('Illegal out_of_bound option.') - frame_inds = np.concatenate(frame_inds) + self.start_index + + start_index = results['start_index'] + frame_inds = np.concatenate(frame_inds) + start_index results['frame_inds'] = frame_inds.astype(np.int) results['clip_len'] = self.clip_len results['frame_interval'] = self.frame_interval @@ -185,8 +190,6 @@ class DenseSampleFrames(SampleFrames): frame_interval (int): Temporal interval of adjacent sampled frames. Default: 1. num_clips (int): Number of clips to be sampled. Default: 1. - start_index (int): Specify a start index for frames in consideration of - different filename format. Default: 1. sample_range (int): Total sample range for dense sample. Default: 64. num_sample_positions (int): Number of sample start positions, Which is @@ -201,7 +204,6 @@ class DenseSampleFrames(SampleFrames): clip_len, frame_interval=1, num_clips=1, - start_index=1, sample_range=64, num_sample_positions=10, temporal_jitter=False, @@ -211,7 +213,6 @@ class DenseSampleFrames(SampleFrames): clip_len, frame_interval, num_clips, - start_index, temporal_jitter, out_of_bound_opt=out_of_bound_opt, test_mode=test_mode) @@ -285,10 +286,6 @@ class SampleProposalFrames(SampleFrames): Default: 1. test_interval (int): Temporal interval of adjacent sampled frames in test mode. Default: 6. - start_index (int): Specify a start index for frames in consideration of - different filename format. However, when taking videos as input, - it should be set to 0, since frames loaded from videos count - from 0. Default: 1. temporal_jitter (bool): Whether to apply temporal jittering. Default: False. mode (str): Choose 'train', 'val' or 'test' mode. @@ -302,13 +299,11 @@ class SampleProposalFrames(SampleFrames): aug_ratio, frame_interval=1, test_interval=6, - start_index=1, temporal_jitter=False, mode='train'): super().__init__( clip_len, frame_interval=frame_interval, - start_index=start_index, temporal_jitter=temporal_jitter) self.body_segments = body_segments self.aug_segments = aug_segments @@ -500,7 +495,8 @@ class SampleProposalFrames(SampleFrames): self.frame_interval, size=len(frame_inds)) frame_inds += perframe_offsets - frame_inds = np.mod(frame_inds, total_frames) + self.start_index + start_index = results['start_index'] + frame_inds = np.mod(frame_inds, total_frames) + start_index results['frame_inds'] = np.array(frame_inds).astype(np.int) results['clip_len'] = self.clip_len diff --git a/mmaction/datasets/rawframe_dataset.py b/mmaction/datasets/rawframe_dataset.py index 3fdae69..b85be9c 100644 --- a/mmaction/datasets/rawframe_dataset.py +++ b/mmaction/datasets/rawframe_dataset.py @@ -85,11 +85,12 @@ class RawframeDataset(BaseDataset): with_offset=False, multi_class=False, num_classes=None, + start_index=1, modality='RGB'): self.filename_tmpl = filename_tmpl self.with_offset = with_offset super().__init__(ann_file, pipeline, data_prefix, test_mode, - multi_class, num_classes, modality) + multi_class, num_classes, start_index, modality) def load_annotations(self): """Load annotation file to get video information.""" @@ -134,6 +135,7 @@ class RawframeDataset(BaseDataset): results = copy.deepcopy(self.video_infos[idx]) results['filename_tmpl'] = self.filename_tmpl results['modality'] = self.modality + results['start_index'] = self.start_index return self.pipeline(results) def prepare_test_frames(self, idx): @@ -141,6 +143,7 @@ class RawframeDataset(BaseDataset): results = 
copy.deepcopy(self.video_infos[idx]) results['filename_tmpl'] = self.filename_tmpl results['modality'] = self.modality + results['start_index'] = self.start_index return self.pipeline(results) def evaluate(self, diff --git a/mmaction/datasets/video_dataset.py b/mmaction/datasets/video_dataset.py index 560ce26..99a7b42 100644 --- a/mmaction/datasets/video_dataset.py +++ b/mmaction/datasets/video_dataset.py @@ -27,8 +27,21 @@ class VideoDataset(BaseDataset): some/path/003.mp4 2 some/path/004.mp4 3 some/path/005.mp4 3 + + + Args: + ann_file (str): Path to the annotation file. + pipeline (list[dict | callable]): A sequence of data transforms. + start_index (int): Specify a start index for frames in consideration of + different filename format. However, when taking videos as input, + it should be set to 0, since frames loaded from videos count + from 0. Default: 0. + **kwargs: Keyword arguments for ``BaseDataset``. """ + def __init__(self, ann_file, pipeline, start_index=0, **kwargs): + super().__init__(ann_file, pipeline, start_index=start_index, **kwargs) + def load_annotations(self): """Load annotation file to get video information.""" video_infos = [] diff --git a/tests/test_data/test_dataset.py b/tests/test_data/test_dataset.py index 0e6419f..539ab8a 100644 --- a/tests/test_data/test_dataset.py +++ b/tests/test_data/test_dataset.py @@ -59,6 +59,7 @@ class TestDataset(object): assert rawframe_infos == [ dict(frame_dir=frame_dir, total_frames=5, label=127) ] * 2 + assert rawframe_dataset.start_index == 1 def test_rawframe_dataset_with_offset(self): rawframe_dataset = RawframeDataset( @@ -71,6 +72,7 @@ class TestDataset(object): assert rawframe_infos == [ dict(frame_dir=frame_dir, offset=2, total_frames=5, label=127) ] * 2 + assert rawframe_dataset.start_index == 1 def test_rawframe_dataset_multi_label(self): rawframe_dataset = RawframeDataset( @@ -90,6 +92,7 @@ class TestDataset(object): assert info['frame_dir'] == frame_dir assert info['total_frames'] == 5 assert torch.all(info['label'] == label) + assert rawframe_dataset.start_index == 1 def test_dataset_realpath(self): dataset = RawframeDataset(self.frame_ann_file, self.frame_pipeline, @@ -100,14 +103,20 @@ class TestDataset(object): assert dataset.data_prefix == 's3://good' def test_video_dataset(self): - video_dataset = VideoDataset(self.video_ann_file, self.video_pipeline, - self.data_prefix) + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix=self.data_prefix) video_infos = video_dataset.video_infos video_filename = osp.join(self.data_prefix, 'test.mp4') assert video_infos == [dict(filename=video_filename, label=0)] * 2 + assert video_dataset.start_index == 0 def test_rawframe_pipeline(self): - target_keys = ['frame_dir', 'total_frames', 'label', 'filename_tmpl'] + target_keys = [ + 'frame_dir', 'total_frames', 'label', 'filename_tmpl', + 'start_index', 'modality' + ] # RawframeDataset not in test mode rawframe_dataset = RawframeDataset( @@ -129,6 +138,17 @@ class TestDataset(object): result = rawframe_dataset[0] assert self.check_keys_contain(result.keys(), target_keys) + # RawframeDataset with offset + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, + self.data_prefix, + with_offset=True, + num_classes=400, + test_mode=False) + result = rawframe_dataset[0] + assert self.check_keys_contain(result.keys(), target_keys + ['offset']) + # RawframeDataset in test mode rawframe_dataset = RawframeDataset( self.frame_ann_file, @@ -149,14 +169,25 @@ class 
TestDataset(object): result = rawframe_dataset[0] assert self.check_keys_contain(result.keys(), target_keys) + # RawframeDataset with offset + rawframe_dataset = RawframeDataset( + self.frame_ann_file_with_offset, + self.frame_pipeline, + self.data_prefix, + with_offset=True, + num_classes=400, + test_mode=True) + result = rawframe_dataset[0] + assert self.check_keys_contain(result.keys(), target_keys + ['offset']) + def test_video_pipeline(self): - target_keys = ['filename', 'label'] + target_keys = ['filename', 'label', 'start_index', 'modality'] # VideoDataset not in test mode video_dataset = VideoDataset( self.video_ann_file, self.video_pipeline, - self.data_prefix, + data_prefix=self.data_prefix, test_mode=False) result = video_dataset[0] assert self.check_keys_contain(result.keys(), target_keys) @@ -165,7 +196,7 @@ class TestDataset(object): video_dataset = VideoDataset( self.video_ann_file, self.video_pipeline, - self.data_prefix, + data_prefix=self.data_prefix, test_mode=True) result = video_dataset[0] assert self.check_keys_contain(result.keys(), target_keys) @@ -221,8 +252,10 @@ class TestDataset(object): ['top1_acc', 'top5_acc', 'mean_class_accuracy']) def test_video_evaluate(self): - video_dataset = VideoDataset(self.video_ann_file, self.video_pipeline, - self.data_prefix) + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix=self.data_prefix) with pytest.raises(TypeError): # results must be a list @@ -248,10 +281,14 @@ class TestDataset(object): ['top1_acc', 'top5_acc', 'mean_class_accuracy']) def test_base_dataset(self): - video_dataset = VideoDataset(self.video_ann_file, self.video_pipeline, - self.data_prefix) + video_dataset = VideoDataset( + self.video_ann_file, + self.video_pipeline, + data_prefix=self.data_prefix, + start_index=3) assert len(video_dataset) == 2 assert type(video_dataset[0]) == dict + assert video_dataset.start_index == 3 def test_repeat_dataset(self): rawframe_dataset = RawframeDataset(self.frame_ann_file, diff --git a/tests/test_data/test_loading.py b/tests/test_data/test_loading.py index bf5bc34..e0b180d 100644 --- a/tests/test_data/test_loading.py +++ b/tests/test_data/test_loading.py @@ -60,11 +60,15 @@ class TestLoading(object): cls.flow_filename_tmpl = '{}_{:05d}.jpg' video_total_frames = len(mmcv.VideoReader(cls.video_path)) cls.video_results = dict( - filename=cls.video_path, label=1, total_frames=video_total_frames) + filename=cls.video_path, + label=1, + total_frames=video_total_frames, + start_index=0) cls.frame_results = dict( frame_dir=cls.img_dir, total_frames=cls.total_frames, filename_tmpl=cls.filename_tmpl, + start_index=1, modality='RGB', offset=0, label=1) @@ -92,6 +96,7 @@ class TestLoading(object): video_id='test_imgs', total_frames=cls.total_frames, filename_tmpl=cls.filename_tmpl, + start_index=1, out_props=[[['test_imgs', ExampleSSNInstance(1, 4, 10, 1, 1, 1)], 0], [['test_imgs', @@ -103,6 +108,12 @@ class TestLoading(object): 'total_frames' ] + with pytest.warns(UserWarning): + # start_index has been deprecated + config = dict( + clip_len=3, frame_interval=1, num_clips=5, start_index=1) + SampleFrames(**config) + # Sample Frame with no temporal_jitter # clip_len=3, frame_interval=1, num_clips=5 video_result = copy.deepcopy(self.video_results) @@ -116,6 +127,8 @@ class TestLoading(object): assert len(sample_frames_results['frame_inds']) == 15 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 15 + assert 
np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame with no temporal_jitter # clip_len=5, frame_interval=1, num_clips=5, @@ -150,6 +163,8 @@ class TestLoading(object): frame_inds = sample_frames_results['frame_inds'].reshape([5, 5]) for i in range(5): assert check_monotonous(frame_inds[i]) + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame with temporal_jitter # clip_len=4, frame_interval=2, num_clips=5 @@ -164,6 +179,8 @@ class TestLoading(object): assert len(sample_frames_results['frame_inds']) == 20 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 20 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame with no temporal_jitter in test mode # clip_len=4, frame_interval=1, num_clips=6 @@ -182,6 +199,8 @@ class TestLoading(object): assert len(sample_frames_results['frame_inds']) == 24 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 24 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame with no temporal_jitter in test mode # clip_len=3, frame_interval=1, num_clips=6 @@ -200,6 +219,8 @@ class TestLoading(object): assert len(sample_frames_results['frame_inds']) == 18 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 18 + assert np.max(sample_frames_results['frame_inds']) <= 5 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame with no temporal_jitter to get clip_offsets # clip_len=1, frame_interval=1, num_clips=8 @@ -223,7 +244,7 @@ class TestLoading(object): np.array([1, 2, 2, 3, 4, 5, 5, 6])) # Sample Frame with no temporal_jitter to get clip_offsets - # clip_len=1, frame_interval=1, num_clips=8, start_index=0 + # clip_len=1, frame_interval=1, num_clips=8 video_result = copy.deepcopy(self.video_results) frame_result = copy.deepcopy(self.frame_results) frame_result['total_frames'] = 6 @@ -231,18 +252,18 @@ class TestLoading(object): clip_len=1, frame_interval=1, num_clips=8, - start_index=0, temporal_jitter=False, test_mode=True) sample_frames = SampleFrames(**config) sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 assert self.check_keys_contain(sample_frames_results.keys(), target_keys) assert len(sample_frames_results['frame_inds']) == 8 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 8 assert_array_equal(sample_frames_results['frame_inds'], - np.array([0, 1, 1, 2, 3, 4, 4, 5])) + np.array([1, 2, 2, 3, 4, 5, 5, 6])) # Sample Frame with no temporal_jitter to get clip_offsets zero # clip_len=6, frame_interval=1, num_clips=1 @@ -257,6 +278,7 @@ class TestLoading(object): test_mode=True) sample_frames = SampleFrames(**config) sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 assert self.check_keys_contain(sample_frames_results.keys(), target_keys) assert len(sample_frames_results['frame_inds']) == 6 @@ -278,11 +300,14 @@ class TestLoading(object): test_mode=False) sample_frames = SampleFrames(**config) sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 assert 
self.check_keys_contain(sample_frames_results.keys(), target_keys) assert len(sample_frames_results['frame_inds']) == 240 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 240 + assert np.max(sample_frames_results['frame_inds']) <= 30 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame with no temporal_jitter to get clip_offsets # clip_len=1, frame_interval=1, num_clips=8 @@ -299,6 +324,7 @@ class TestLoading(object): sample_frames_results = sample_frames(video_result) assert self.check_keys_contain(sample_frames_results.keys(), target_keys) + assert sample_frames_results['start_index'] == 0 assert len(sample_frames_results['frame_inds']) == 8 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 8 @@ -318,11 +344,14 @@ class TestLoading(object): test_mode=False) sample_frames = SampleFrames(**config) sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 assert self.check_keys_contain(sample_frames_results.keys(), target_keys) assert len(sample_frames_results['frame_inds']) == 24 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 24 + assert np.max(sample_frames_results['frame_inds']) <= 10 + assert np.min(sample_frames_results['frame_inds']) >= 1 # Sample Frame using twice sample # clip_len=12, frame_interval=1, num_clips=2 @@ -338,11 +367,14 @@ class TestLoading(object): test_mode=True) sample_frames = SampleFrames(**config) sample_frames_results = sample_frames(video_result) + assert sample_frames_results['start_index'] == 0 assert self.check_keys_contain(sample_frames_results.keys(), target_keys) assert len(sample_frames_results['frame_inds']) == 48 sample_frames_results = sample_frames(frame_result) assert len(sample_frames_results['frame_inds']) == 48 + assert np.max(sample_frames_results['frame_inds']) <= 40 + assert np.min(sample_frames_results['frame_inds']) >= 1 def test_dense_sample_frames(self): target_keys = [ @@ -362,6 +394,7 @@ class TestLoading(object): test_mode=True) dense_sample_frames = DenseSampleFrames(**config) dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 assert self.check_keys_contain(dense_sample_frames_results.keys(), target_keys) assert len(dense_sample_frames_results['frame_inds']) == 240 @@ -376,6 +409,7 @@ class TestLoading(object): clip_len=4, frame_interval=1, num_clips=6, temporal_jitter=False) dense_sample_frames = DenseSampleFrames(**config) dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 assert self.check_keys_contain(dense_sample_frames_results.keys(), target_keys) assert len(dense_sample_frames_results['frame_inds']) == 24 @@ -395,6 +429,7 @@ class TestLoading(object): test_mode=True) dense_sample_frames = DenseSampleFrames(**config) dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 assert self.check_keys_contain(dense_sample_frames_results.keys(), target_keys) assert len(dense_sample_frames_results['frame_inds']) == 240 @@ -413,6 +448,7 @@ class TestLoading(object): temporal_jitter=False) dense_sample_frames = DenseSampleFrames(**config) dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 assert 
self.check_keys_contain(dense_sample_frames_results.keys(), target_keys) assert len(dense_sample_frames_results['frame_inds']) == 24 @@ -431,6 +467,7 @@ class TestLoading(object): temporal_jitter=False) dense_sample_frames = DenseSampleFrames(**config) dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 assert self.check_keys_contain(dense_sample_frames_results.keys(), target_keys) assert len(dense_sample_frames_results['frame_inds']) == 24 @@ -452,6 +489,7 @@ class TestLoading(object): test_mode=True) dense_sample_frames = DenseSampleFrames(**config) dense_sample_frames_results = dense_sample_frames(video_result) + assert dense_sample_frames_results['start_index'] == 0 assert self.check_keys_contain(dense_sample_frames_results.keys(), target_keys) assert len(dense_sample_frames_results['frame_inds']) == 120 @@ -461,7 +499,7 @@ class TestLoading(object): def test_sample_proposal_frames(self): target_keys = [ 'frame_inds', 'clip_len', 'frame_interval', 'num_clips', - 'total_frames' + 'total_frames', 'start_index' ] # test error cases @@ -475,7 +513,7 @@ class TestLoading(object): aug_ratio=0.5, temporal_jitter=False) sample_frames = SampleProposalFrames(**config) - sample_frames_results = sample_frames(proposal_result) + sample_frames(proposal_result) # test normal cases # Sample Frame with no temporal_jitter @@ -839,7 +877,7 @@ class TestLoading(object): def test_rawframe_decode(self): target_keys = ['frame_inds', 'imgs', 'original_shape', 'modality'] - # test frame selector with 2 dim input when start_index = 0 + # test frame selector with 2 dim input inputs = copy.deepcopy(self.frame_results) inputs['frame_inds'] = np.arange(0, self.total_frames, 2)[:, np.newaxis] @@ -887,7 +925,7 @@ class TestLoading(object): 320, 3) assert results['original_shape'] == (240, 320) - # test frame selector with 1 dim input when start_index = 0 + # test frame selector with 1 dim input inputs = copy.deepcopy(self.frame_results) inputs['frame_inds'] = np.arange(0, self.total_frames, 2) # since the test images start with index 1, we plus 1 to frame_inds @@ -911,7 +949,6 @@ class TestLoading(object): assert results['original_shape'] == (240, 320) # test frame selector with 1 dim input for flow images - # when start_index = 0 inputs = copy.deepcopy(self.flow_frame_results) inputs['frame_inds'] = np.arange(0, self.total_frames, 2) # since the test images start with index 1, we plus 1 to frame_inds -- GitLab
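
Usage note: with this patch, `start_index` moves from the `SampleFrames` pipeline step to the dataset. A minimal config sketch of the migration follows (the annotation paths are placeholders, not files from this repository); passing `start_index` to `SampleFrames` now only raises a `UserWarning` and is otherwise ignored:

    train_pipeline = [
        dict(type='DecordInit'),
        # 'start_index' is no longer set here; SampleFrames reads it from
        # results['start_index'], which the dataset injects into each sample
        # before the pipeline runs.
        dict(type='SampleFrames', clip_len=32, frame_interval=2, num_clips=1),
        dict(type='DecordDecode'),
    ]
    data = dict(
        train=dict(
            type='VideoDataset',
            ann_file='path/to/train_list.txt',  # placeholder path
            data_prefix='path/to/videos',       # placeholder path
            pipeline=train_pipeline,
            start_index=0))  # frames decoded from videos count from 0;
                             # RawframeDataset defaults to 1 instead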
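The same contract applies when a dataset is built directly. A sketch assuming a toy annotation file exists at the placeholder path: each sample now carries the dataset-level offset, which is what the new `start_index` assertions in the tests above verify, and `inference_recognizer` picks the value up from `cfg.data.test.get('start_index', ...)` with a default of 1 for rawframe inputs and 0 for video inputs:

    from mmaction.datasets import RawframeDataset

    dataset = RawframeDataset(
        ann_file='path/to/ann.txt',       # placeholder annotation list
        pipeline=[],                      # empty pipeline, to inspect results
        data_prefix='path/to/rawframes',  # placeholder path
        start_index=1)                    # rawframes are named from index 1
    sample = dataset[0]
    # The dataset copies its start_index into every results dict, so
    # SampleFrames can offset frame_inds without owning the argument.
    assert sample['start_index'] == 1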