import numpy as np

from ltr.data import transforms
import ltr.data.processing_utils as prutils
from pytracking.libs import TensorDict


class BaseProcessing:
    """Base class for Processing. A Processing class is used to process the data
    returned by a dataset, before passing it through the network. For example,
    it can be used to crop a search region around the object, apply various
    data augmentations, etc."""

    def __init__(self, transform=transforms.ToArray(), train_transform=None,
                 test_transform=None, joint_transform=None):
        """
        args:
            transform       - The set of transformations to be applied on the images.
                              Used only if train_transform or test_transform is None.
            train_transform - The set of transformations to be applied on the train images.
                              If None, the 'transform' argument is used instead.
            test_transform  - The set of transformations to be applied on the test images.
                              If None, the 'transform' argument is used instead.
            joint_transform - The set of transformations to be applied 'jointly' on the train
                              and test images. For example, it can be used to convert both
                              test and train images to grayscale.
        """
        self.transform = {
            'train': transform if train_transform is None else train_transform,
            'test': transform if test_transform is None else test_transform,
            'joint': joint_transform
        }

    def __call__(self, data: TensorDict):
        raise NotImplementedError


class SiamFCProcessing(BaseProcessing):
    """Processing for SiamFC-style training: the target box is jittered, a search
    region centered at the jittered box is cropped, and the crop is resized to a
    fixed size. All size/factor arguments are dicts indexed by 'train'/'test'."""

    def __init__(self, search_area_factor, output_sz, center_jitter_factor,
                 scale_jitter_factor, mode='pair', scale_type='context',
                 border_type='meanpad', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.search_area_factor = search_area_factor
        self.output_sz = output_sz
        self.center_jitter_factor = center_jitter_factor
        self.scale_jitter_factor = scale_jitter_factor
        self.mode = mode
        self.scale_type = scale_type
        self.border_type = border_type

    def _get_jittered_box(self, box, mode, rng):
        """Jitter the input box.

        args:
            box  - input bounding box
            mode - string 'train' or 'test' indicating train or test data
            rng  - random number generator

        returns:
            array - jittered box
        """
        jittered_size = box[2:4] * np.exp(rng.randn(2) * self.scale_jitter_factor[mode])
        max_offset = np.sqrt(jittered_size.prod()) * self.center_jitter_factor[mode]
        jittered_center = box[0:2] + 0.5 * box[2:4] + max_offset * (rng.rand(2) - 0.5)

        return np.concatenate((jittered_center - 0.5 * jittered_size, jittered_size), axis=0)

    def __call__(self, data: TensorDict, rng=None):
        if rng is None:
            rng = np.random  # fall back to the global numpy RNG

        # Apply joint transforms
        if self.transform['joint'] is not None:
            num_train_images = len(data['train_images'])

            all_images = data['train_images'] + data['test_images']
            all_images_trans = self.transform['joint'](*all_images)

            data['train_images'] = all_images_trans[:num_train_images]
            data['test_images'] = all_images_trans[num_train_images:]

        for s in ['train', 'test']:
            assert self.mode == 'sequence' or len(data[s + '_images']) == 1, \
                "In pair mode, num train/test frames must be 1"

            # Add uniform noise to the center position (and log-normal noise to the scale)
            jittered_anno = [self._get_jittered_box(a, s, rng) for a in data[s + '_anno']]

            # Crop an image region centered at the jittered_anno box
            try:
                crops, boxes = prutils.jittered_center_crop(
                    data[s + '_images'],
                    jittered_anno,
                    data[s + '_anno'],
                    self.search_area_factor[s],
                    self.output_sz[s],
                    scale_type=self.scale_type,
                    border_type=self.border_type)
            except Exception as e:
                print('{}, anno: {}'.format(data['dataset'], data[s + '_anno']))
                raise e

            # Apply transforms
            data[s + '_images'] = [self.transform[s](x) for x in crops]
            data[s + '_anno'] = boxes

        # Prepare output
        if self.mode == 'sequence':
            data = data.apply(prutils.stack_tensors)
        else:
            data = data.apply(lambda x: x[0] if isinstance(x, list) else x)

        return data
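# A minimal usage sketch for SiamFCProcessing (not part of the training
# pipeline). The hyperparameter values below are illustrative assumptions, not
# values taken from any actual experiment configuration; an identity transform
# is passed so the sketch does not depend on a particular transform pipeline.
def _demo_siamfc_processing():
    rng = np.random.RandomState(0)
    processing = SiamFCProcessing(
        search_area_factor={'train': 1.0, 'test': 2.0},     # assumed values
        output_sz={'train': 127, 'test': 255},              # assumed values
        center_jitter_factor={'train': 0.1, 'test': 1.0},   # assumed values
        scale_jitter_factor={'train': 0.05, 'test': 0.25},  # assumed values
        mode='pair',
        transform=lambda x: x)
    data = TensorDict({
        'dataset': 'demo',
        'train_images': [rng.randint(0, 255, (360, 480, 3), dtype=np.uint8)],
        'test_images': [rng.randint(0, 255, (360, 480, 3), dtype=np.uint8)],
        'train_anno': [np.array([100., 80., 60., 40.])],
        'test_anno': [np.array([110., 85., 60., 40.])],
    })
    return processing(data, rng=rng)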
class ATOMProcessing(BaseProcessing):
    """The processing class used for training ATOM. The images are processed in
    the following way.

    First, the target bounding box is jittered by adding some noise. Next, a
    square region (called the search region), centered at the jittered target
    center and with an area search_area_factor^2 times the area of the jittered
    box, is cropped from the image. The reason for jittering the target box is
    to avoid learning the bias that the target is always at the center of the
    search region. The search region is then resized to a fixed size given by
    the argument output_sz. Finally, a set of proposals is generated for the
    test images by jittering the ground-truth box.
    """

    def __init__(self, search_area_factor, output_sz, center_jitter_factor,
                 scale_jitter_factor, proposal_params, mode='pair',
                 *args, **kwargs):
        """
        args:
            search_area_factor   - The size of the search region relative to the target size.
            output_sz            - An integer, denoting the size to which the search region is
                                   resized. The search region is always square.
            center_jitter_factor - A dict containing the amount of jittering to be applied to
                                   the target center before extracting the search region.
                                   See _get_jittered_box for how the jittering is done.
            scale_jitter_factor  - A dict containing the amount of jittering to be applied to
                                   the target size before extracting the search region.
                                   See _get_jittered_box for how the jittering is done.
            proposal_params      - Arguments for the proposal generation process.
                                   See _generate_proposals for details.
            mode                 - Either 'pair' or 'sequence'. If mode='sequence', the output
                                   has an extra dimension for frames.
        """
        super().__init__(*args, **kwargs)
        self.search_area_factor = search_area_factor
        self.output_sz = output_sz
        self.center_jitter_factor = center_jitter_factor
        self.scale_jitter_factor = scale_jitter_factor
        self.proposal_params = proposal_params
        self.mode = mode

    def _get_jittered_box(self, box, mode, rng):
        """Jitter the input box.

        args:
            box  - input bounding box
            mode - string 'train' or 'test' indicating train or test data
            rng  - random number generator

        returns:
            array - jittered box
        """
        jittered_size = box[2:4] * np.exp(rng.randn(2) * self.scale_jitter_factor[mode])
        max_offset = np.sqrt(jittered_size.prod()) * self.center_jitter_factor[mode]
        jittered_center = box[0:2] + 0.5 * box[2:4] + max_offset * (rng.rand(2) - 0.5)

        return np.concatenate((jittered_center - 0.5 * jittered_size, jittered_size), axis=0)
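    # Note on the jitter model above (illustrative): the size is scaled per axis
    # by a log-normal factor exp(N(0, 1) * scale_jitter_factor), and the center
    # is shifted uniformly within +/- 0.5 * max_offset along each axis, where
    # max_offset = sqrt(w * h) * center_jitter_factor. For example, for a 60x40
    # jittered box with center_jitter_factor = 0.25, the center moves by at most
    # 0.5 * sqrt(2400) * 0.25 ≈ 6.1 pixels per axis.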
    def _generate_proposals(self, box, rng):
        """Generate proposals by adding noise to the input box.

        args:
            box - input box
            rng - random number generator

        returns:
            array - Array of shape (num_proposals, 4) containing the proposals
            array - Array of shape (num_proposals,) containing the IoU overlap of each
                    proposal with the input box. The IoU is mapped to [-1, 1].
        """
        # Generate proposals
        num_proposals = self.proposal_params['boxes_per_frame']
        proposals = np.zeros((num_proposals, 4))
        gt_iou = np.zeros(num_proposals)

        for i in range(num_proposals):
            proposals[i, :], gt_iou[i] = prutils.perturb_box(
                box,
                min_iou=self.proposal_params['min_iou'],
                sigma_factor=self.proposal_params['sigma_factor'],
                rng=rng)

        # Map IoU to [-1, 1]
        gt_iou = gt_iou * 2 - 1

        return proposals, gt_iou

    def __call__(self, data: TensorDict, rng=None):
        """
        args:
            data - The input data, should contain the following fields:
                   'train_images', 'test_images', 'train_anno', 'test_anno'

        returns:
            TensorDict - output data block with the following fields:
                         'train_images', 'test_images', 'train_anno', 'test_anno',
                         'test_proposals', 'proposal_iou'
        """
        if rng is None:
            rng = np.random  # fall back to the global numpy RNG

        # Apply joint transforms
        if self.transform['joint'] is not None:
            num_train_images = len(data['train_images'])

            all_images = data['train_images'] + data['test_images']
            all_images_trans = self.transform['joint'](*all_images)

            data['train_images'] = all_images_trans[:num_train_images]
            data['test_images'] = all_images_trans[num_train_images:]

        for s in ['train', 'test']:
            assert self.mode == 'sequence' or len(data[s + '_images']) == 1, \
                "In pair mode, num train/test frames must be 1"

            # Add uniform noise to the center position (and log-normal noise to the scale)
            jittered_anno = [self._get_jittered_box(a, s, rng) for a in data[s + '_anno']]

            # Crop an image region centered at the jittered_anno box
            try:
                crops, boxes = prutils.jittered_center_crop(
                    data[s + '_images'], jittered_anno, data[s + '_anno'],
                    self.search_area_factor, self.output_sz)
            except Exception as e:
                print('{}, anno: {}'.format(data['dataset'], data[s + '_anno']))
                raise e

            # Apply transforms
            data[s + '_images'] = [self.transform[s](x) for x in crops]
            data[s + '_anno'] = boxes

        # Generate proposals for the test frames
        frame2_proposals, gt_iou = zip(
            *[self._generate_proposals(a, rng) for a in data['test_anno']])

        data['test_proposals'] = list(frame2_proposals)
        data['proposal_iou'] = list(gt_iou)

        # Prepare output
        if self.mode == 'sequence':
            data = data.apply(prutils.stack_tensors)
        else:
            data = data.apply(lambda x: x[0] if isinstance(x, list) else x)

        return data
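if __name__ == '__main__':
    # Minimal smoke test (illustrative only). All hyperparameter values below
    # are assumptions chosen for demonstration, not the values used for
    # training ATOM; an identity transform is used so the demo does not depend
    # on a particular transform implementation.
    rng = np.random.RandomState(0)
    processing = ATOMProcessing(
        search_area_factor=5.0,                            # assumed value
        output_sz=288,                                     # assumed value
        center_jitter_factor={'train': 0.0, 'test': 4.5},  # assumed values
        scale_jitter_factor={'train': 0.0, 'test': 0.5},   # assumed values
        proposal_params={'boxes_per_frame': 16,            # assumed values
                         'min_iou': 0.1,
                         'sigma_factor': [0.01, 0.05, 0.1, 0.2, 0.3]},
        mode='pair',
        transform=lambda x: x)
    data = TensorDict({
        'dataset': 'demo',
        'train_images': [rng.randint(0, 255, (360, 480, 3), dtype=np.uint8)],
        'test_images': [rng.randint(0, 255, (360, 480, 3), dtype=np.uint8)],
        'train_anno': [np.array([100., 80., 60., 40.])],
        'test_anno': [np.array([110., 85., 60., 40.])],
    })
    out = processing(data, rng=rng)
    print('test_proposals:', out['test_proposals'].shape,
          'proposal_iou:', out['proposal_iou'].shape)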