# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import logging
import math
import random

import cv2
import numpy as np
from PIL import Image, ImageEnhance
from paddle.io import Dataset

logger = logging.getLogger(__name__)

__all__ = ['KineticsDataset']


class KineticsDataset(Dataset):
    """Dataset that decodes video clips and packs them as slow/fast pathways.

    The file list named in the config is read once at construction time.
    Each item is decoded with OpenCV, temporally sub-sampled, resized,
    cropped, optionally flipped, normalized, and finally split into a
    "slow" and a "fast" frame stack.
    """

    def __init__(self, mode, cfg):
        """Read the settings for ``mode`` from ``cfg`` and index the file list.

        Args:
            mode: one of 'train', 'valid', 'test' or 'infer'.
            cfg: config object exposing ``cfg.MODEL``, ``cfg.TEST`` and
                ``cfg[mode.upper()]`` (the latter indexed like a dict).

        Raises:
            ValueError: if ``mode`` is not one of the four supported values.
        """
        super().__init__()
        self.mode = mode
        self.format = cfg.MODEL.format
        self.num_frames = cfg.MODEL.num_frames
        self.sampling_rate = cfg.MODEL.sampling_rate
        self.target_fps = cfg.MODEL.target_fps
        self.slowfast_alpha = cfg.MODEL.alpha

        self.target_size = cfg[mode.upper()]['target_size']
        self.img_mean = cfg.MODEL.image_mean
        self.img_std = cfg.MODEL.image_std
        self.filelist = cfg[mode.upper()]['filelist']

        if self.mode in ["train", "valid"]:
            # One randomly sampled clip per video.
            self.min_size = cfg[mode.upper()]['min_size']
            self.max_size = cfg[mode.upper()]['max_size']
            self.num_ensemble_views = 1
            self.num_spatial_crops = 1
            self._num_clips = 1
        elif self.mode in ['test', 'infer']:
            # Deterministic multi-view sampling: every video contributes
            # num_ensemble_views * num_spatial_crops dataset entries.
            self.min_size = self.max_size = self.target_size
            self.num_ensemble_views = cfg.TEST.num_ensemble_views
            self.num_spatial_crops = cfg.TEST.num_spatial_crops
            self._num_clips = (
                self.num_ensemble_views * self.num_spatial_crops)
        else:
            # Fail early instead of hitting an AttributeError later on.
            raise ValueError(
                "mode must be one of 'train', 'valid', 'test' or 'infer', "
                "got {!r}".format(mode))

        self._construct_loader()

    def _construct_loader(self):
        """
        Construct the video loader.

        Fills three parallel lists (video path, label, clip index) with
        ``self._num_clips`` consecutive entries per line of the file list.
        In 'infer' mode a line is just a path and the label is set to 0;
        otherwise a line is "<path> <label>".
        """
        self._num_retries = 5
        self._path_to_videos = []
        self._labels = []
        self._spatial_temporal_idx = []

        with open(self.filelist, "r") as f:
            for path_label in f.read().splitlines():
                if not path_label.strip():
                    # Blank lines (e.g. a stray empty line in the list)
                    # carry no sample and would break the split below.
                    continue
                if self.mode == 'infer':
                    path = path_label
                    label = 0  # without label when infer actually
                else:
                    path, label = path_label.split()
                label = int(label)
                for idx in range(self._num_clips):
                    self._path_to_videos.append(path)
                    self._labels.append(label)
                    self._spatial_temporal_idx.append(idx)

    def __len__(self):
        """Return the number of (video, clip) entries."""
        return len(self._path_to_videos)

    def _sample_indices(self, idx):
        """Return (temporal_sample_index, spatial_sample_index) for ``idx``.

        Both are -1 (meaning "sample randomly") outside test/infer mode.
        """
        if self.mode in ["test", 'infer']:
            clip = self._spatial_temporal_idx[idx]
            return (clip // self.num_spatial_crops,
                    clip % self.num_spatial_crops)
        return -1, -1

    def __getitem__(self, idx):
        """Load one sample, retrying with a random other entry on failure.

        Returns:
            ``(slow_pathway, fast_pathway, np.array([label]),
            np.array([idx]))`` on success, where ``idx`` is the entry that
            was actually loaded (it differs from the requested one after a
            retry). If every attempt fails, ``(None, None)`` is returned.
        """
        for ir in range(self._num_retries):
            # Recomputed on every attempt so the sample indices always
            # belong to the entry that is actually being loaded.
            temporal_sample_index, spatial_sample_index = \
                self._sample_indices(idx)
            mp4_path = self._path_to_videos[idx]
            try:
                pathways = self.mp4_loader(
                    mp4_path,
                    temporal_sample_index,
                    spatial_sample_index,
                    temporal_num_clips=self.num_ensemble_views,
                    spatial_num_clips=self.num_spatial_crops,
                    num_frames=self.num_frames,
                    sampling_rate=self.sampling_rate,
                    target_fps=self.target_fps,
                    target_size=self.target_size,
                    img_mean=self.img_mean,
                    img_std=self.img_std,
                    slowfast_alpha=self.slowfast_alpha,
                    min_size=self.min_size,
                    max_size=self.max_size)
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt
                # and SystemExit still propagate.
                if ir < self._num_retries - 1:
                    logger.error(
                        'Error when loading {}, have {} trys, will try again'.
                        format(mp4_path, ir))
                    idx = random.randint(0, len(self._path_to_videos) - 1)
                    continue
                logger.error(
                    'Error when loading {}, have {} trys, will not try again'.
                    format(mp4_path, ir))
                return None, None

            label = self._labels[idx]
            return (pathways[0], pathways[1], np.array([label]),
                    np.array([idx]))

    def mp4_loader(self, filepath, temporal_sample_index, spatial_sample_index,
                   temporal_num_clips, spatial_num_clips, num_frames,
                   sampling_rate, target_fps, target_size, img_mean, img_std,
                   slowfast_alpha, min_size, max_size):
        """Run the full decode -> augment -> normalize -> pack pipeline.

        Returns:
            ``[slow_pathway, fast_pathway]`` as produced by ``pack_output``.
        """
        frames_sample, clip_size = self.decode_sampling(
            filepath, temporal_sample_index, temporal_num_clips, num_frames,
            sampling_rate, target_fps)
        frames_select = self.temporal_sampling(
            frames_sample, clip_size, num_frames, filepath,
            temporal_sample_index, temporal_num_clips)
        frames_resize = self.scale(frames_select, min_size, max_size)
        frames_crop = self.crop(frames_resize, target_size,
                                spatial_sample_index, spatial_num_clips)
        frames_flip = self.flip(frames_crop, spatial_sample_index)

        # list of PIL images -> one float32 ndarray
        npframes = (np.stack(frames_flip)).astype('float32')
        npframes_norm = self.color_norm(npframes, img_mean, img_std)
        frames_list = self.pack_output(npframes_norm, slowfast_alpha)

        return frames_list

    def get_start_end_idx(self, video_size, clip_size, clip_idx,
                          temporal_num_clips):
        """Pick the (possibly fractional) frame window of one clip.

        Args:
            video_size: number of frames available.
            clip_size: length of the window, in frames.
            clip_idx: -1 for a random start, otherwise the index of the
                evenly spaced clip to take.
            temporal_num_clips: number of evenly spaced clips; unused when
                ``clip_idx`` is -1.

        Returns:
            ``(start_idx, end_idx)`` with ``end_idx = start_idx +
            clip_size - 1``.
        """
        delta = max(video_size - clip_size, 0)
        if clip_idx == -1:
            # Random temporal sampling; temporal_num_clips is not used here.
            start_idx = random.uniform(0, delta)
        else:
            # Uniformly sample the clip with the given index.
            start_idx = delta * clip_idx / temporal_num_clips
        end_idx = start_idx + clip_size - 1
        return start_idx, end_idx

    def decode_sampling(self, filepath, temporal_sample_index,
                        temporal_num_clips, num_frames, sampling_rate,
                        target_fps):
        """Decode the frames of one clip window from ``filepath``.

        For paths ending in 'mp4' only the selected window (plus one buffer
        frame) is decoded; for any other path every frame is decoded.

        Returns:
            ``(frames_sample, clip_size)``: the decoded frames in RGB channel
            order, and the window length in source-video frames.
        """
        cap = cv2.VideoCapture(filepath)
        try:
            videolen = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

            # Only the major version matters; parsing just the first
            # component also tolerates version strings that do not have
            # exactly three dot-separated parts.
            major_ver = int(cv2.__version__.split('.')[0])
            if major_ver < 3:
                fps = cap.get(cv2.cv.CV_CAP_PROP_FPS)
            else:
                fps = cap.get(cv2.CAP_PROP_FPS)

            clip_size = num_frames * sampling_rate * fps / target_fps

            if filepath[-3:] != 'mp4':
                start_idx, end_idx = 0, math.inf
            else:
                start_idx, end_idx = self.get_start_end_idx(
                    videolen, clip_size, temporal_sample_index,
                    temporal_num_clips)

            frames_sample = []
            # Seek to the first frame of the window, then decode clip_size
            # frames (plus one buffer frame).
            start_idx = math.ceil(start_idx)
            cap.set(cv2.CAP_PROP_POS_FRAMES, start_idx)
            for i in range(start_idx, videolen):
                ret, frame = cap.read()
                if not ret:
                    continue
                if i > end_idx + 1:  # one frame of buffer past end_idx
                    break
                frames_sample.append(frame[:, :, ::-1])  # BGR -> RGB
        finally:
            # Always free the decoder handle, even when decoding fails.
            cap.release()

        return frames_sample, clip_size

    def temporal_sampling(self, frames_sample, clip_size, num_frames, filepath,
                          temporal_sample_index, temporal_num_clips):
        """ sample num_frames from clip_size

        For paths ending in 'mp4' ``frames_sample`` already is the selected
        window, so frames are taken from its beginning; otherwise the window
        is chosen here with ``temporal_sample_index``.

        Returns:
            list of ``num_frames`` PIL RGB images.

        Raises:
            IndexError: if ``frames_sample`` is empty.
        """
        fs_len = len(frames_sample)
        if fs_len == 0:
            # Same exception type the indexing below would raise, but with
            # a message that names the offending file.
            raise IndexError(
                "no frames decoded from {}".format(filepath))

        if filepath[-3:] != 'mp4':
            start_idx, end_idx = self.get_start_end_idx(
                fs_len, clip_size, temporal_sample_index, temporal_num_clips)
        else:
            start_idx, end_idx = self.get_start_end_idx(fs_len, clip_size, 0,
                                                        1)

        index = np.linspace(start_idx, end_idx, num_frames).astype("int64")
        # Windows longer than the decoded frames repeat the boundary frames.
        index = np.clip(index, 0, fs_len - 1)

        return [
            Image.fromarray(frames_sample[i], mode='RGB') for i in index
        ]

    def scale(self, frames_select, min_size, max_size):
        """Resize frames so the shorter side equals a randomly drawn size.

        The size is drawn uniformly from ``[min_size, max_size]`` and
        rounded; the aspect ratio is kept. The input list is returned as-is
        when the shorter side already has that size.
        """
        size = int(round(np.random.uniform(min_size, max_size)))
        assert (len(frames_select) >= 1) , \
            "len(frames_select):{} should be larger than 1".format(len(frames_select))
        width, height = frames_select[0].size
        if (width <= height and width == size) or (height <= width and
                                                   height == size):
            return frames_select

        new_width = size
        new_height = size
        if width < height:
            new_height = int(math.floor((float(height) / width) * size))
        else:
            new_width = int(math.floor((float(width) / height) * size))

        return [
            img.resize((new_width, new_height), Image.BILINEAR)
            for img in frames_select
        ]

    @staticmethod
    def _crop_offset(slack, index, num_clips):
        """Offset of crop ``index`` out of ``num_clips`` along one axis.

        ``slack`` is the image extent minus the crop extent. The first crop
        sits at 0, the last at ``slack``, the ones in between at multiples
        of the ceil-rounded gap. A single crop is centred.
        """
        if num_clips <= 1:
            # Avoids dividing by (num_clips - 1) == 0.
            return int(math.ceil(slack / 2))
        if index == 0:
            return 0
        if index == num_clips - 1:
            return slack
        gap = int(math.ceil(slack / (num_clips - 1)))
        # The ceil-rounded gap can overshoot; keep the crop inside the image.
        return min(gap * index, slack)

    def crop(self, frames_resize, target_size, spatial_sample_index,
             spatial_num_clips):
        """Cut a ``target_size`` x ``target_size`` square out of every frame.

        With ``spatial_sample_index == -1`` the square is placed randomly.
        Otherwise it is centred along the shorter axis and placed at
        position ``spatial_sample_index`` of ``spatial_num_clips`` along the
        longer axis. Frames that already have the target size are returned
        unchanged.
        """
        w, h = frames_resize[0].size
        if w == target_size and h == target_size:
            return frames_resize

        assert (w >= target_size) and (h >= target_size), \
            "image width({}) and height({}) should be larger than crop size({},{})".format(w, h, target_size, target_size)

        if spatial_sample_index == -1:
            x_offset = random.randint(0, w - target_size)
            y_offset = random.randint(0, h - target_size)
        elif h > w:
            # Portrait: centre horizontally, sweep vertically.
            x_offset = int(math.ceil((w - target_size) / 2))
            y_offset = self._crop_offset(h - target_size,
                                         spatial_sample_index,
                                         spatial_num_clips)
        else:
            # Landscape or square: centre vertically, sweep horizontally.
            y_offset = int(math.ceil((h - target_size) / 2))
            x_offset = self._crop_offset(w - target_size,
                                         spatial_sample_index,
                                         spatial_num_clips)

        box = (x_offset, y_offset, x_offset + target_size,
               y_offset + target_size)
        return [img.crop(box) for img in frames_resize]

    def flip(self, frames_crop, spatial_sample_index):
        """Mirror all frames left-right with probability 0.5.

        No flip is applied when ``spatial_sample_index`` is not -1, i.e.
        when cropping is deterministic.
        """
        # without flip when test
        if spatial_sample_index != -1:
            return frames_crop

        if np.random.uniform() < 0.5:
            return [
                img.transpose(Image.FLIP_LEFT_RIGHT) for img in frames_crop
            ]
        return frames_crop

    def color_norm(self, npframes_norm, c_mean, c_std):
        """Scale to [0, 1] then standardize per channel, in place.

        ``c_mean`` and ``c_std`` hold three values that are broadcast over
        the last axis of the 4-D ``npframes_norm``. The input array is
        modified and returned.
        """
        npframes_norm /= 255.0
        npframes_norm -= np.array(c_mean).reshape(
            [1, 1, 1, 3]).astype(np.float32)
        npframes_norm /= np.array(c_std).reshape(
            [1, 1, 1, 3]).astype(np.float32)
        return npframes_norm

    def pack_output(self, npframes_norm, slowfast_alpha):
        """Split the frame stack into the slow and the fast pathway.

        The fast pathway keeps every frame; the slow pathway keeps
        ``T // slowfast_alpha`` frames evenly spaced over the stack, where
        ``T`` is the size of the first axis.

        Returns:
            ``[slow_pathway, fast_pathway]``, both transposed with
            ``(3, 0, 1, 2)``.
        """
        fast_pathway = npframes_norm

        # sample num points between start and end
        slow_idx_start = 0
        slow_idx_end = fast_pathway.shape[0] - 1
        slow_idx_num = fast_pathway.shape[0] // slowfast_alpha
        slow_idxs_select = np.linspace(slow_idx_start, slow_idx_end,
                                       slow_idx_num).astype("int64")
        slow_pathway = fast_pathway[slow_idxs_select]

        # T H W C -> C T H W.
        slow_pathway = slow_pathway.transpose(3, 0, 1, 2)
        fast_pathway = fast_pathway.transpose(3, 0, 1, 2)

        # slow + fast
        frames_list = [slow_pathway, fast_pathway]
        return frames_list