Hd/flow pipeline

f72cdb23 · dhd · lizz · b53f6729 · f72cdb23 · f72cdb23
16 changed files
--- a/mmaction/datasets/pipelines/augmentations.py
+++ b/mmaction/datasets/pipelines/augmentations.py
@@ -62,7 +62,7 @@ class Fuse(object):

        # crop
        left, top, right, bottom = lazyop['crop_bbox'].round().astype(int)
-        imgs = [img[top:bottom, left:right, :] for img in imgs]
+        imgs = [img[top:bottom, left:right] for img in imgs]

        # resize
        img_h, img_w = results['img_shape']
@@ -132,7 +132,7 @@ class RandomCrop(object):

        if not self.lazy:
            results['imgs'] = [
-                img[y_offset:y_offset + new_h, x_offset:x_offset + new_w, :]
+                img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]
                for img in results['imgs']
            ]
        else:
@@ -261,7 +261,7 @@ class RandomResizedCrop(object):

        if not self.lazy:
            results['imgs'] = [
-                img[top:bottom, left:right, :] for img in results['imgs']
+                img[top:bottom, left:right] for img in results['imgs']
            ]
        else:
            lazyop = results['lazy']
@@ -407,7 +407,7 @@ class MultiScaleCrop(object):

        if not self.lazy:
            results['imgs'] = [
-                img[y_offset:y_offset + new_h, x_offset:x_offset + new_w, :]
+                img[y_offset:y_offset + new_h, x_offset:x_offset + new_w]
                for img in results['imgs']
            ]
        else:
@@ -443,9 +443,10 @@ class MultiScaleCrop(object):
 class Resize(object):
    """Resize images to a specific size.

-    Required keys are "imgs", "img_shape", added or modified keys are "imgs",
-    "img_shape", "keep_ratio", "scale_factor", "lazy" and "resize_size".
-    Required keys in "lazy" is None, added or modified key is "interpolation".
+    Required keys are "imgs", "img_shape", "modality", added or modified
+    keys are "imgs", "img_shape", "keep_ratio", "scale_factor", "lazy",
+    "resize_size". Required keys in "lazy" is None, added or modified key is
+    "interpolation".

    Args:
        scale (float | Tuple[int]): If keep_ratio is True, it serves as scaling
@@ -491,24 +492,24 @@ class Resize(object):
            results (dict): The resulting dict to be modified and passed
                to the next transform in pipeline.
        """
+
        _init_lazy_if_proper(results, self.lazy)

+        if 'scale_factor' not in results:
+            results['scale_factor'] = np.array([1, 1], dtype=np.float32)
        img_h, img_w = results['img_shape']

        if self.keep_ratio:
-            new_size, self.scale_factor = mmcv.rescale_size((img_w, img_h),
-                                                            self.scale,
-                                                            return_scale=True)
-            new_w, new_h = new_size
+            new_w, new_h = mmcv.rescale_size((img_w, img_h), self.scale)
        else:
            new_w, new_h = self.scale
-            self.scale_factor = np.array(
-                [new_w / img_w, new_h / img_h, new_w / img_w, new_h / img_h],
-                dtype=np.float32)
+
+        self.scale_factor = np.array([new_w / img_w, new_h / img_h],
+                                     dtype=np.float32)

        results['img_shape'] = (new_h, new_w)
        results['keep_ratio'] = self.keep_ratio
-        results['scale_factor'] = self.scale_factor
+        results['scale_factor'] = results['scale_factor'] * self.scale_factor

        if not self.lazy:
            results['imgs'] = [
@@ -538,9 +539,9 @@ class Flip(object):

    Reverse the order of elements in the given imgs with a specific direction.
    The shape of the imgs is preserved, but the elements are reordered.
-    Required keys are "imgs", "img_shape", added or modified keys are "imgs",
-    "lazy"and "flip_direction". Required keys in "lazy" is None, added or
-    modified key are "flip" and "flip_direction".
+    Required keys are "imgs", "img_shape", "modality", added or modified
+    keys are "imgs", "lazy" and "flip_direction". Required keys in "lazy" is
+    None, added or modified key are "flip" and "flip_direction".

    Args:
        flip_ratio (float): Probability of implementing flip. Default: 0.5.
@@ -566,6 +567,9 @@ class Flip(object):
                to the next transform in pipeline.
        """
        _init_lazy_if_proper(results, self.lazy)
+        modality = results['modality']
+        if modality == 'Flow':
+            assert self.direction == 'horizontal'

        if np.random.rand() < self.flip_ratio:
            flip = True
@@ -577,8 +581,15 @@ class Flip(object):

        if not self.lazy:
            if flip:
-                for img in results['imgs']:
+                for i, img in enumerate(results['imgs']):
                    mmcv.imflip_(img, self.direction)
+                lt = len(results['imgs'])
+                for i in range(0, lt, 2):
+                    # flow with even indexes are x_flow, which need to be
+                    # inverted when doing horizontal flip
+                    if modality == 'Flow':
+                        results['imgs'][i] = mmcv.iminvert(results['imgs'][i])
+
            else:
                results['imgs'] = list(results['imgs'])
        else:
@@ -602,17 +613,21 @@ class Flip(object):
 class Normalize(object):
    """Normalize images with the given mean and std value.

-    Required keys are "imgs", "img_shape", added or modified keys are "imgs"
-    and "img_norm_cfg".
+    Required keys are "imgs", "img_shape", "modality", added or modified
+    keys are "imgs" and "img_norm_cfg". If modality is 'Flow' and
+    adjust_magnitude is True, the additional key "scale_factor" is required.

    Args:
        mean (Sequence[float]): Mean values of different channels.
        std (Sequence[float]): Std values of different channels.
        to_bgr (bool): Whether to convert channels from RGB to BGR.
            Default: False.
+        adjust_magnitude (bool): Indicate whether to adjust the flow magnitude
+            on 'scale_factor' when modality is 'Flow'. Default: False.
+
    """

-    def __init__(self, mean, std, to_bgr=False):
+    def __init__(self, mean, std, to_bgr=False, adjust_magnitude=False):
        if not isinstance(mean, Sequence):
            raise TypeError(
                f'Mean must be list, tuple or np.ndarray, but got {type(mean)}'
@@ -625,31 +640,60 @@ class Normalize(object):
        self.mean = np.array(mean, dtype=np.float32)
        self.std = np.array(std, dtype=np.float32)
        self.to_bgr = to_bgr
+        self.adjust_magnitude = adjust_magnitude

    def __call__(self, results):
-        """Performs the Normalize augmentation.
-
-        Args:
-            results (dict): The resulting dict to be modified and passed
-                to the next transform in pipeline.
-        """
-        n = len(results['imgs'])
-        h, w, c = results['imgs'][0].shape
-        imgs = np.empty((n, h, w, c), dtype=np.float32)
-        for i, img in enumerate(results['imgs']):
-            imgs[i] = img
+        modality = results['modality']

-        for img in imgs:
-            mmcv.imnormalize_(img, self.mean, self.std, self.to_bgr)
+        if modality == 'RGB':
+            n = len(results['imgs'])
+            h, w, c = results['imgs'][0].shape
+            imgs = np.empty((n, h, w, c), dtype=np.float32)
+            for i, img in enumerate(results['imgs']):
+                imgs[i] = img

-        results['imgs'] = imgs
-        results['img_norm_cfg'] = dict(
-            mean=self.mean, std=self.std, to_bgr=self.to_bgr)
-        return results
+            for img in imgs:
+                mmcv.imnormalize_(img, self.mean, self.std, self.to_bgr)
+
+            results['imgs'] = imgs
+            results['img_norm_cfg'] = dict(
+                mean=self.mean, std=self.std, to_bgr=self.to_bgr)
+            return results
+        elif modality == 'Flow':
+            num_imgs = len(results['imgs'])
+            assert num_imgs % 2 == 0
+            assert self.mean.shape[0] == 2
+            assert self.std.shape[0] == 2
+            n = num_imgs // 2
+            h, w = results['imgs'][0].shape
+            x_flow = np.empty((n, h, w), dtype=np.float32)
+            y_flow = np.empty((n, h, w), dtype=np.float32)
+            for i in range(n):
+                x_flow[i] = results['imgs'][2 * i]
+                y_flow[i] = results['imgs'][2 * i + 1]
+            x_flow = (x_flow - self.mean[0]) / self.std[0]
+            y_flow = (y_flow - self.mean[1]) / self.std[1]
+            if self.adjust_magnitude:
+                x_flow = x_flow * results['scale_factor'][0]
+                y_flow = y_flow * results['scale_factor'][1]
+            imgs = np.stack([x_flow, y_flow], axis=-1)
+            results['imgs'] = imgs
+            args = dict(
+                mean=self.mean,
+                std=self.std,
+                to_bgr=self.to_bgr,
+                adjust_magnitude=self.adjust_magnitude)
+            results['img_norm_cfg'] = args
+            return results
+        else:
+            raise NotImplementedError

    def __repr__(self):
        repr_str = (f'{self.__class__.__name__}('
-                    f'mean={self.mean}, std={self.std}, to_bgr={self.to_bgr})')
+                    f'mean={self.mean}, '
+                    f'std={self.std}, '
+                    f'to_bgr={self.to_bgr}, '
+                    f'adjust_magnitude={self.adjust_magnitude})')
        return repr_str


@@ -696,7 +740,7 @@ class CenterCrop(object):

        if not self.lazy:
            results['imgs'] = [
-                img[top:bottom, left:right, :] for img in results['imgs']
+                img[top:bottom, left:right] for img in results['imgs']
            ]
        else:
            lazyop = results['lazy']
@@ -776,7 +820,7 @@ class ThreeCrop(object):
        for x_offset, y_offset in offsets:
            bbox = [x_offset, y_offset, x_offset + crop_w, y_offset + crop_h]
            crop = [
-                img[y_offset:y_offset + crop_h, x_offset:x_offset + crop_w, :]
+                img[y_offset:y_offset + crop_h, x_offset:x_offset + crop_w]
                for img in imgs
            ]
            cropped.extend(crop)
@@ -842,8 +886,8 @@ class TenCrop(object):
        crop_bboxes = list()
        for x_offset, y_offsets in offsets:
            crop = [
-                img[y_offsets:y_offsets + crop_h,
-                    x_offset:x_offset + crop_w, :] for img in imgs
+                img[y_offsets:y_offsets + crop_h, x_offset:x_offset + crop_w]
+                for img in imgs
            ]
            flip_crop = [np.flip(c, axis=1).copy() for c in crop]
            bbox = [x_offset, y_offsets, x_offset + crop_w, y_offsets + crop_h]
@@ -910,7 +954,7 @@ class MultiGroupCrop(object):

            bbox = [x_offset, y_offset, x_offset + crop_w, y_offset + crop_h]
            crop = [
-                img[y_offset:y_offset + crop_h, x_offset:x_offset + crop_w, :]
+                img[y_offset:y_offset + crop_h, x_offset:x_offset + crop_w]
                for img in imgs
            ]
            img_crops.extend(crop)

--- a/mmaction/datasets/pipelines/formating.py
+++ b/mmaction/datasets/pipelines/formating.py
@@ -229,7 +229,7 @@ class FormatShape(object):

    def __init__(self, input_format):
        self.input_format = input_format
-        if self.input_format not in ['NCTHW', 'NCHW']:
+        if self.input_format not in ['NCTHW', 'NCHW', 'NCHW_Flow']:
            raise ValueError(
                f'The input format {self.input_format} is invalid.')

@@ -257,6 +257,19 @@ class FormatShape(object):
        elif self.input_format == 'NCHW':
            imgs = np.transpose(imgs, (0, 3, 1, 2))
            # M x C x H x W
+        elif self.input_format == 'NCHW_Flow':
+            num_clips = results['num_clips']
+            clip_len = results['clip_len']
+            imgs = imgs.reshape((-1, num_clips, clip_len) + imgs.shape[1:])
+            # N_crops x N_clips x L x H x W x C
+            imgs = np.transpose(imgs, (0, 1, 2, 5, 3, 4))
+            # N_crops x N_clips x L x C x H x W
+            imgs = imgs.reshape((-1, imgs.shape[2] * imgs.shape[3]) +
+                                imgs.shape[4:])
+            # M' x C' x H x W
+            # M' = N_crops x N_clips
+            # C' = L x C
+
        results['imgs'] = imgs
        results['input_shape'] = imgs.shape
        return results

--- a/mmaction/datasets/pipelines/loading.py
+++ b/mmaction/datasets/pipelines/loading.py
@@ -574,6 +574,7 @@ class FrameSelector(object):

        directory = results['frame_dir']
        filename_tmpl = results['filename_tmpl']
+        modality = results['modality']

        if self.file_client is None:
            self.file_client = FileClient(self.io_backend, **self.kwargs)
@@ -588,11 +589,24 @@ class FrameSelector(object):
            # TODO: add offset attributes in datasets.
            if frame_idx == 0:
                frame_idx += 1
-            filepath = osp.join(directory, filename_tmpl.format(frame_idx))
-            img_bytes = self.file_client.get(filepath)
-            # Get frame with channel order RGB directly.
-            cur_frame = mmcv.imfrombytes(img_bytes, channel_order='rgb')
-            imgs.append(cur_frame)
+            if modality == 'RGB':
+                filepath = osp.join(directory, filename_tmpl.format(frame_idx))
+                img_bytes = self.file_client.get(filepath)
+                # Get frame with channel order RGB directly.
+                cur_frame = mmcv.imfrombytes(img_bytes, channel_order='rgb')
+                imgs.append(cur_frame)
+            elif modality == 'Flow':
+                x_filepath = osp.join(directory,
+                                      filename_tmpl.format('x', frame_idx))
+                y_filepath = osp.join(directory,
+                                      filename_tmpl.format('y', frame_idx))
+                x_img_bytes = self.file_client.get(x_filepath)
+                x_frame = mmcv.imfrombytes(x_img_bytes, flag='grayscale')
+                y_img_bytes = self.file_client.get(y_filepath)
+                y_frame = mmcv.imfrombytes(y_img_bytes, flag='grayscale')
+                imgs.extend([x_frame, y_frame])
+            else:
+                raise NotImplementedError

        results['imgs'] = imgs
        results['original_shape'] = imgs[0].shape[:2]

--- a/mmaction/datasets/rawframe_dataset.py
+++ b/mmaction/datasets/rawframe_dataset.py
@@ -55,6 +55,8 @@ class RawframeDataset(BaseDataset):
        multi_class (bool): Determines whether it is a multi-class
            recognition dataset. Default: False.
        num_classes (int): Number of classes in the dataset. Default: None.
+        modality (str): Modality of data. Support 'RGB', 'Flow'.
+                            Default: 'RGB'.
    """

    def __init__(self,
@@ -64,10 +66,13 @@ class RawframeDataset(BaseDataset):
                 test_mode=False,
                 filename_tmpl='img_{:05}.jpg',
                 multi_class=False,
-                 num_classes=None):
+                 num_classes=None,
+                 modality='RGB'):
        super().__init__(ann_file, pipeline, data_prefix, test_mode,
                         multi_class, num_classes)
+        assert modality in ['RGB', 'Flow']
        self.filename_tmpl = filename_tmpl
+        self.modality = modality

    def load_annotations(self):
        video_infos = []
@@ -96,11 +101,13 @@ class RawframeDataset(BaseDataset):
    def prepare_train_frames(self, idx):
        results = copy.deepcopy(self.video_infos[idx])
        results['filename_tmpl'] = self.filename_tmpl
+        results['modality'] = self.modality
        return self.pipeline(results)

    def prepare_test_frames(self, idx):
        results = copy.deepcopy(self.video_infos[idx])
        results['filename_tmpl'] = self.filename_tmpl
+        results['modality'] = self.modality
        return self.pipeline(results)

    def evaluate(self,

--- a/tests/data/test_imgs/x_00001.jpg
+++ b/tests/data/test_imgs/x_00001.jpg
--- a/tests/data/test_imgs/x_00002.jpg
+++ b/tests/data/test_imgs/x_00002.jpg
--- a/tests/data/test_imgs/x_00003.jpg
+++ b/tests/data/test_imgs/x_00003.jpg
--- a/tests/data/test_imgs/x_00004.jpg
+++ b/tests/data/test_imgs/x_00004.jpg
--- a/tests/data/test_imgs/x_00005.jpg
+++ b/tests/data/test_imgs/x_00005.jpg
--- a/tests/data/test_imgs/y_00001.jpg
+++ b/tests/data/test_imgs/y_00001.jpg
--- a/tests/data/test_imgs/y_00002.jpg
+++ b/tests/data/test_imgs/y_00002.jpg
--- a/tests/data/test_imgs/y_00003.jpg
+++ b/tests/data/test_imgs/y_00003.jpg
--- a/tests/data/test_imgs/y_00004.jpg
+++ b/tests/data/test_imgs/y_00004.jpg
--- a/tests/data/test_imgs/y_00005.jpg
+++ b/tests/data/test_imgs/y_00005.jpg
--- a/tests/test_augmentations.py
+++ b/tests/test_augmentations.py
 import copy

+import mmcv
 import numpy as np
 import pytest
 from numpy.testing import assert_array_almost_equal
@@ -517,35 +518,48 @@ class TestAugumentations(object):
            # scale must be tuple of int
            Resize('224')

-        target_keys = ['imgs', 'img_shape', 'keep_ratio', 'scale_factor']
+        target_keys = [
+            'imgs', 'img_shape', 'keep_ratio', 'scale_factor', 'modality'
+        ]
+
+        # test resize for flow images
+        imgs = list(np.random.rand(2, 240, 320))
+        results = dict(imgs=imgs, modality='Flow')
+        resize = Resize(scale=(160, 80), keep_ratio=False)
+        resize_results = resize(results)
+        assert self.check_keys_contain(resize_results.keys(), target_keys)
+        assert np.all(resize_results['scale_factor'] == np.array(
+            [.5, 1. / 3.], dtype=np.float32))
+        assert resize_results['img_shape'] == (80, 160)

        # scale with -1 to indicate np.inf
        imgs = list(np.random.rand(2, 240, 320, 3))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        resize = Resize(scale=(-1, 256), keep_ratio=True)
        resize_results = resize(results)
        assert self.check_keys_contain(resize_results.keys(), target_keys)
-        assert resize_results['scale_factor'] == 256 / 240
+        assert np.all(resize_results['scale_factor'] == np.array(
+            [341 / 320, 256 / 240], dtype=np.float32))
        assert resize_results['img_shape'] == (256, 341)

        # scale with a normal tuple (320, 320) to indicate np.inf
        imgs = list(np.random.rand(2, 240, 320, 3))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        resize = Resize(scale=(320, 320), keep_ratio=False)
        resize_results = resize(results)
        assert self.check_keys_contain(resize_results.keys(), target_keys)
        assert np.all(resize_results['scale_factor'] == np.array(
-            [1, 320 / 240, 1, 320 / 240], dtype=np.float32))
+            [1, 320 / 240], dtype=np.float32))
        assert resize_results['img_shape'] == (320, 320)

        # scale with a normal tuple (341, 256) to indicate np.inf
        imgs = list(np.random.rand(2, 240, 320, 3))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        resize = Resize(scale=(341, 256), keep_ratio=False)
        resize_results = resize(results)
        assert self.check_keys_contain(resize_results.keys(), target_keys)
        assert np.all(resize_results['scale_factor'] == np.array(
-            [341 / 320, 256 / 240, 341 / 320, 256 / 240], dtype=np.float32))
+            [341 / 320, 256 / 240], dtype=np.float32))
        assert resize_results['img_shape'] == (256, 341)

        assert repr(resize) == (
@@ -562,41 +576,44 @@ class TestAugumentations(object):
            # scale must be tuple of int
            Resize('224', lazy=True)

-        target_keys = ['imgs', 'img_shape', 'keep_ratio', 'scale_factor']
+        target_keys = [
+            'imgs', 'img_shape', 'keep_ratio', 'scale_factor', 'modality'
+        ]

        # scale with -1 to indicate np.inf
        imgs = list(np.random.rand(2, 240, 320, 3))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        resize = Resize(scale=(-1, 256), keep_ratio=True, lazy=True)
        resize_results = resize(results)
        assert id(imgs) == id(resize_results['imgs'])
        assert self.check_keys_contain(resize_results.keys(), target_keys)
        resize_results_fuse = Fuse()(resize_results)
-        assert resize_results_fuse['scale_factor'] == 256 / 240
+        assert np.all(resize_results_fuse['scale_factor'] == np.array(
+            [341 / 320, 256 / 240], dtype=np.float32))
        assert resize_results_fuse['img_shape'] == (256, 341)

        # scale with a normal tuple (320, 320) to indicate np.inf
        imgs = list(np.random.rand(2, 240, 320, 3))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        resize = Resize(scale=(320, 320), keep_ratio=False, lazy=True)
        resize_results = resize(results)
        assert id(imgs) == id(resize_results['imgs'])
        assert self.check_keys_contain(resize_results.keys(), target_keys)
        resize_results_fuse = Fuse()(resize_results)
        assert np.all(resize_results_fuse['scale_factor'] == np.array(
-            [1, 320 / 240, 1, 320 / 240], dtype=np.float32))
+            [1, 320 / 240], dtype=np.float32))
        assert resize_results_fuse['img_shape'] == (320, 320)

        # scale with a normal tuple (341, 256) to indicate np.inf
        imgs = list(np.random.rand(2, 240, 320, 3))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        resize = Resize(scale=(341, 256), keep_ratio=False, lazy=True)
        resize_results = resize(results)
        assert id(imgs) == id(resize_results['imgs'])
        assert self.check_keys_contain(resize_results.keys(), target_keys)
        resize_results_fuse = Fuse()(resize_results)
        assert np.all(resize_results_fuse['scale_factor'] == np.array(
-            [341 / 320, 256 / 240, 341 / 320, 256 / 240], dtype=np.float32))
+            [341 / 320, 256 / 240], dtype=np.float32))
        assert resize_results_fuse['img_shape'] == (256, 341)

        assert repr(resize) == (f'{resize.__class__.__name__ }'
@@ -608,11 +625,11 @@ class TestAugumentations(object):
            # direction must be in ['horizontal', 'vertical']
            Flip(direction='vertically')

-        target_keys = ['imgs', 'flip_direction']
+        target_keys = ['imgs', 'flip_direction', 'modality']

        # do not flip imgs.
        imgs = list(np.random.rand(2, 64, 64, 3))
-        results = dict(imgs=copy.deepcopy(imgs))
+        results = dict(imgs=copy.deepcopy(imgs), modality='RGB')
        flip = Flip(flip_ratio=0, direction='horizontal')
        flip_results = flip(results)
        assert self.check_keys_contain(flip_results.keys(), target_keys)
@@ -622,7 +639,7 @@ class TestAugumentations(object):

        # always flip imgs horizontally.
        imgs = list(np.random.rand(2, 64, 64, 3))
-        results = dict(imgs=copy.deepcopy(imgs))
+        results = dict(imgs=copy.deepcopy(imgs), modality='RGB')
        flip = Flip(flip_ratio=1, direction='horizontal')
        flip_results = flip(results)
        assert self.check_keys_contain(flip_results.keys(), target_keys)
@@ -632,9 +649,31 @@ class TestAugumentations(object):
        assert id(flip_results['imgs']) == id(results['imgs'])
        assert np.shape(flip_results['imgs']) == np.shape(imgs)

+        # flip flow images horizontally
+        imgs = [
+            np.arange(16).reshape(4, 4).astype(np.float32),
+            np.arange(16, 32).reshape(4, 4).astype(np.float32)
+        ]
+        results = dict(imgs=copy.deepcopy(imgs), modality='Flow')
+        flip = Flip(flip_ratio=1, direction='horizontal')
+        flip_results = flip(results)
+        assert self.check_keys_contain(flip_results.keys(), target_keys)
+        imgs = [x.reshape(4, 4, 1) for x in imgs]
+        flip_results['imgs'] = [
+            x.reshape(4, 4, 1) for x in flip_results['imgs']
+        ]
+        if flip_results['flip'] is True:
+            assert self.check_flip([imgs[0]],
+                                   [mmcv.iminvert(flip_results['imgs'][0])],
+                                   flip_results['flip_direction'])
+            assert self.check_flip([imgs[1]], [flip_results['imgs'][1]],
+                                   flip_results['flip_direction'])
+        assert id(flip_results['imgs']) == id(results['imgs'])
+        assert np.shape(flip_results['imgs']) == np.shape(imgs)
+
        # always flip imgs vertivally.
        imgs = list(np.random.rand(2, 64, 64, 3))
-        results = dict(imgs=copy.deepcopy(imgs))
+        results = dict(imgs=copy.deepcopy(imgs), modality='RGB')
        flip = Flip(flip_ratio=1, direction='vertical')
        flip_results = flip(results)
        assert self.check_keys_contain(flip_results.keys(), target_keys)
@@ -652,12 +691,12 @@ class TestAugumentations(object):
        with pytest.raises(ValueError):
            Flip(direction='vertically', lazy=True)

-        target_keys = ['imgs', 'flip_direction']
+        target_keys = ['imgs', 'flip_direction', 'modality']

        # do not flip imgs.
        imgs = list(np.random.rand(2, 64, 64, 3))
        imgs_tmp = imgs.copy()
-        results = dict(imgs=imgs_tmp)
+        results = dict(imgs=imgs_tmp, modality='RGB')
        flip = Flip(flip_ratio=0, direction='horizontal', lazy=True)
        flip_results = flip(results)
        assert id(imgs_tmp) == id(flip_results['imgs'])
@@ -670,7 +709,7 @@ class TestAugumentations(object):
        # always flip imgs horizontally.
        imgs = list(np.random.rand(2, 64, 64, 3))
        imgs_tmp = imgs.copy()
-        results = dict(imgs=imgs_tmp)
+        results = dict(imgs=imgs_tmp, modality='RGB')
        flip = Flip(flip_ratio=1, direction='horizontal', lazy=True)
        flip_results = flip(results)
        assert id(imgs_tmp) == id(flip_results['imgs'])
@@ -684,7 +723,7 @@ class TestAugumentations(object):
        # always flip imgs vertivally.
        imgs = list(np.random.rand(2, 64, 64, 3))
        imgs_tmp = imgs.copy()
-        results = dict(imgs=imgs_tmp)
+        results = dict(imgs=imgs_tmp, modality='RGB')
        flip = Flip(flip_ratio=1, direction='vertical', lazy=True)
        flip_results = flip(results)
        assert id(imgs_tmp) == id(flip_results['imgs'])
@@ -710,11 +749,11 @@ class TestAugumentations(object):
            Normalize([123.675, 116.28, 103.53],
                      dict(std=[58.395, 57.12, 57.375]))

-        target_keys = ['imgs', 'img_norm_cfg']
+        target_keys = ['imgs', 'img_norm_cfg', 'modality']

        # normalize imgs in RGB format
        imgs = list(np.random.rand(2, 240, 320, 3).astype(np.float32))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        config = dict(
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
@@ -725,9 +764,24 @@ class TestAugumentations(object):
        self.check_normalize(imgs, normalize_results['imgs'],
                             normalize_results['img_norm_cfg'])

+        # normalize flow imgs
+        imgs = list(np.random.rand(4, 240, 320).astype(np.float32))
+        results = dict(imgs=imgs, modality='Flow')
+        config = dict(mean=[128, 128], std=[128, 128])
+        normalize = Normalize(**config)
+        normalize_results = normalize(results)
+        assert self.check_keys_contain(normalize_results.keys(), target_keys)
+        assert normalize_results['imgs'].shape == (2, 240, 320, 2)
+        x_components = np.array(imgs[0::2])
+        y_components = np.array(imgs[1::2])
+        x_components = (x_components - config['mean'][0]) / config['std'][0]
+        y_components = (y_components - config['mean'][1]) / config['std'][1]
+        result_imgs = np.stack([x_components, y_components], axis=-1)
+        assert np.all(np.isclose(result_imgs, normalize_results['imgs']))
+
        # normalize imgs in BGR format
        imgs = list(np.random.rand(2, 240, 320, 3).astype(np.float32))
-        results = dict(imgs=imgs)
+        results = dict(imgs=imgs, modality='RGB')
        config = dict(
            mean=[123.675, 116.28, 103.53],
            std=[58.395, 57.12, 57.375],
@@ -741,7 +795,8 @@ class TestAugumentations(object):
        assert normalize.__repr__() == (
            normalize.__class__.__name__ +
            f'(mean={np.array([123.675, 116.28, 103.53])}, ' +
-            f'std={np.array([58.395, 57.12, 57.375])}, to_bgr={True})')
+            f'std={np.array([58.395, 57.12, 57.375])}, to_bgr={True}, '
+            f'adjust_magnitude={False})')

    def test_center_crop(self):
        with pytest.raises(TypeError):

--- a/tests/test_loading.py
+++ b/tests/test_loading.py
 import copy
-import os
 import os.path as osp

 import numpy as np
@@ -33,13 +32,21 @@ class TestLoading(object):
            osp.dirname(__file__), 'data/test_bsp_features')
        cls.proposals_dir = osp.join(
            osp.dirname(__file__), 'data/test_proposals')
-        cls.total_frames = len(os.listdir(cls.img_dir))
+        cls.total_frames = 5
        cls.filename_tmpl = 'img_{:05}.jpg'
+        cls.flow_filename_tmpl = '{}_{:05d}.jpg'
        cls.video_results = dict(filename=cls.video_path, label=1)
        cls.frame_results = dict(
            frame_dir=cls.img_dir,
            total_frames=cls.total_frames,
            filename_tmpl=cls.filename_tmpl,
+            modality='RGB',
+            label=1)
+        cls.flow_frame_results = dict(
+            frame_dir=cls.img_dir,
+            total_frames=cls.total_frames,
+            filename_tmpl=cls.flow_filename_tmpl,
+            modality='Flow',
            label=1)
        cls.action_results = dict(
            video_name='v_test1',
@@ -537,7 +544,7 @@ class TestLoading(object):
            video_result['frame_inds']), 256, 340, 3)

    def test_frame_selector(self):
-        target_keys = ['frame_inds', 'imgs', 'original_shape']
+        target_keys = ['frame_inds', 'imgs', 'original_shape', 'modality']

        # test frame selector with 2 dim input
        inputs = copy.deepcopy(self.frame_results)
@@ -570,6 +577,16 @@ class TestLoading(object):
                                             320, 3)
        assert results['original_shape'] == (240, 320)

+        # test frame selector with 1 dim input for flow images
+        inputs = copy.deepcopy(self.flow_frame_results)
+        inputs['frame_inds'] = np.arange(1, self.total_frames, 2)
+        frame_selector = FrameSelector(io_backend='disk')
+        results = frame_selector(inputs)
+        assert self.check_keys_contain(results.keys(), target_keys)
+        assert np.shape(results['imgs']) == (len(inputs['frame_inds']) * 2,
+                                             240, 320)
+        assert results['original_shape'] == (240, 320)
+
        # test frame selector in turbojpeg decording backend
        inputs = copy.deepcopy(self.frame_results)
        inputs['frame_inds'] = np.arange(1, self.total_frames, 5)