import time
import math
import cv2
import numpy as np
import paddle.fluid as fluid
from paddle.fluid import dygraph

from pytracking.tracker.base.basetracker import BaseTracker
from ltr.data.anchor import Anchors


class SiamMask(BaseTracker):
    """Anchor-based siamese tracker with an optional mask-refinement branch.

    The tracker keeps its state as ``self.pos`` (target center, ``(y, x)``)
    and ``self.target_sz`` (``(height, width)``).  All hyper-parameters are
    read from ``self.params``; the network itself is reached through
    ``self.params.features.features[0].net``.
    """

    def initialize_features(self):
        """Initialize the feature extractor once; later calls are no-ops."""
        if not getattr(self, 'features_initialized', False):
            self.params.features.initialize()
        self.features_initialized = True

    def initialize(self, image, state, *args, **kwargs):
        """Set up the tracker on the first frame.

        Args:
            image: first frame; indexed as ``(row, col, channel)``.
            state: initial box, read as ``[x, y, width, height]``.
            *args, **kwargs: accepted for interface compatibility; unused.
        """
        # Initialize some stuff
        self.frame_num = 1

        # Initialize features
        self.initialize_features()

        self.time = 0
        tic = time.time()

        # Same normalisation as in track(): _crop_and_resize needs .shape.
        image = np.asarray(image)

        # Get position and size
        # self.pos: target center (y, x)
        self.pos = np.array(
            [state[1] + state[3] // 2, state[0] + state[2] // 2],
            dtype=np.float32)
        self.target_sz = np.array([state[3], state[2]], dtype=np.float32)

        # Set search area
        context = self.params.context_amount * np.sum(self.target_sz)
        self.z_sz = np.sqrt(np.prod(self.target_sz + context))
        self.x_sz = round(
            self.z_sz *
            (self.params.instance_size / self.params.exemplar_size))
        self.score_size = (
            (self.params.instance_size - self.params.exemplar_size) //
            self.params.anchor_stride + 1 + self.params.base_size)
        self.anchor_num = (
            len(self.params.anchor_ratios) * len(self.params.anchor_scales))

        # Cosine window, repeated once per anchor so that it lines up with
        # the flattened score vector used in track().
        hanning = np.hanning(self.score_size)
        window = np.outer(hanning, hanning)
        self.window = np.tile(window.flatten(), self.anchor_num)
        self.anchors = self.generate_anchor(self.score_size)

        # Mean color of the first frame, used as padding for every crop.
        self.avg_color = np.mean(image, axis=(0, 1))

        with dygraph.guard():
            exemplar_image = self._crop_and_resize(
                image,
                self.pos,
                self.z_sz,
                out_size=self.params.exemplar_size,
                pad_color=self.avg_color)

            # get template
            self.params.features.features[0].net.template(exemplar_image)

        self.time += time.time() - tic

    def track(self, image):
        """Locate the target in a new frame and update the tracker state.

        Args:
            image: current frame; indexed as ``(row, col, channel)``.

        Returns:
            A flat list.  ``[x, y, width, height]`` when the network has no
            ``refine_head`` or ``self.params.polygon`` is falsy; otherwise
            the four corners of a rotated box flattened to eight numbers.
        """
        self.frame_num += 1

        # Convert image
        image = np.asarray(image)

        with dygraph.guard():
            # search images
            instance_image = self._crop_and_resize(
                image,
                self.pos,
                self.x_sz,
                out_size=self.params.instance_size,
                pad_color=self.avg_color)
            # Search window in image coordinates as [x, y, w, h]; captured
            # before the state update below because the mask branch needs
            # the window the network actually saw.
            instance_box = [
                self.pos[1] - self.x_sz / 2,
                self.pos[0] - self.x_sz / 2,
                self.x_sz,
                self.x_sz,
            ]

            # predict
            output = self.params.features.features[0].net.track(
                instance_image)
            score = self._convert_score(output['cls'])
            pred_bbox = self._convert_bbox(output['loc'], self.anchors)

        def change(r):
            # Symmetric ratio: always >= 1 regardless of direction.
            return np.maximum(r, 1. / r)

        def sz(w, h):
            # Context-padded size of a w x h box.
            pad = (w + h) * 0.5
            return np.sqrt((w + pad) * (h + pad))

        # scale penalty
        scale_z = self.params.exemplar_size / self.z_sz
        s_c = change(
            sz(pred_bbox[2, :], pred_bbox[3, :]) /
            sz(self.target_sz[1] * scale_z, self.target_sz[0] * scale_z))

        # aspect ratio penalty
        r_c = change(
            (self.target_sz[1] / self.target_sz[0]) /
            (pred_bbox[2, :] / pred_bbox[3, :]))
        penalty = np.exp(-(r_c * s_c - 1) * self.params.penalty_k)
        pscore = penalty * score

        # window penalty
        pscore = (pscore * (1 - self.params.window_influence) +
                  self.window * self.params.window_influence)
        best_idx = np.argmax(pscore)

        # Back from network-input scale to image scale.
        bbox = pred_bbox[:, best_idx] / scale_z
        lr = penalty[best_idx] * score[best_idx] * self.params.lr

        cx = bbox[0] + self.pos[1]
        cy = bbox[1] + self.pos[0]

        # smooth bbox
        width = self.target_sz[1] * (1 - lr) + bbox[2] * lr
        height = self.target_sz[0] * (1 - lr) + bbox[3] * lr

        # clip boundary
        cx, cy, width, height = self._bbox_clip(
            cx, cy, width, height, image.shape[:2])

        # update state
        self.pos = np.array([cy, cx])
        self.target_sz = np.array([height, width])
        context = self.params.context_amount * np.sum(self.target_sz)
        self.z_sz = np.sqrt(np.prod(self.target_sz + context))
        self.x_sz = round(
            self.z_sz *
            (self.params.instance_size / self.params.exemplar_size))

        net = self.params.features.features[0].net
        if net.refine_head is None or not self.params.polygon:
            # Return new state
            yx = self.pos - self.target_sz / 2
            new_state = np.array(
                [yx[1], yx[0], self.target_sz[1], self.target_sz[0]],
                'float32')
            return new_state.tolist()

        # processing mask
        # The flattened score vector is laid out as
        # (anchor, row, col); use the configured anchor count rather than
        # a hard-coded 5 so other anchor configurations index correctly.
        pos = np.unravel_index(
            best_idx, (self.anchor_num, self.score_size, self.score_size))
        delta_x, delta_y = int(pos[2]), int(pos[1])

        with dygraph.guard():
            mask = net.mask_refine((delta_y, delta_x))
            mask = fluid.layers.sigmoid(mask)
            mask = fluid.layers.reshape(mask, [-1])
            out_size = self.params.mask_output_size
            mask = fluid.layers.reshape(mask, [out_size, out_size]).numpy()

        # Image-space box covered by the mask: an exemplar-sized window
        # positioned at the winning score-map cell.
        s = instance_box[2] / self.params.instance_size
        base_size = self.params.base_size
        stride = self.params.anchor_stride
        sub_box = [
            instance_box[0] + (delta_x - base_size / 2) * stride * s,
            instance_box[1] + (delta_y - base_size / 2) * stride * s,
            s * self.params.exemplar_size,
            s * self.params.exemplar_size,
        ]

        # Express the full image in mask-pixel coordinates so _crop_back
        # can warp the mask onto the image plane.
        s = out_size / sub_box[2]
        im_h, im_w = image.shape[:2]
        back_box = [-sub_box[0] * s, -sub_box[1] * s, im_w * s, im_h * s]
        mask_in_img = self._crop_back(mask, back_box, (im_w, im_h))
        polygon = self._mask_post_processing(mask_in_img)

        # Return new state
        new_state = polygon.flatten()
        return new_state.tolist()

    def generate_anchor(self, score_size):
        """Build the anchor table for a ``score_size`` x ``score_size`` map.

        Returns:
            Array of shape ``(anchor_num * score_size**2, 4)`` holding
            ``(cx, cy, w, h)`` per anchor, with centers placed on a grid
            centered on zero and spaced by the anchor stride.
        """
        anchors = Anchors(
            self.params.anchor_stride,
            self.params.anchor_ratios,
            self.params.anchor_scales)
        anchor = anchors.anchors

        # Corner form (x1, y1, x2, y2) -> center form (cx, cy, w, h).
        x1, y1, x2, y2 = anchor[:, 0], anchor[:, 1], anchor[:, 2], anchor[:, 3]
        anchor = np.stack(
            [(x1 + x2) * 0.5, (y1 + y2) * 0.5, x2 - x1, y2 - y1], 1)

        total_stride = anchors.stride
        anchor_num = anchor.shape[0]
        anchor = np.tile(anchor, score_size * score_size).reshape((-1, 4))

        ori = -(score_size // 2) * total_stride
        grid = [ori + total_stride * step for step in range(score_size)]
        xx, yy = np.meshgrid(grid, grid)
        xx = np.tile(xx.flatten(), (anchor_num, 1)).flatten()
        yy = np.tile(yy.flatten(), (anchor_num, 1)).flatten()
        anchor[:, 0] = xx.astype(np.float32)
        anchor[:, 1] = yy.astype(np.float32)
        return anchor

    def _crop_and_resize(self, image, center, size, out_size, pad_color):
        """Crop a square patch around ``center`` and resize it.

        Args:
            image: source array indexed as ``(row, col, channel)``.
            center: patch center as ``(y, x)``.
            size: side length of the square crop in image pixels.
            out_size: side length of the returned patch.
            pad_color: constant color used where the crop leaves the image.

        Returns:
            A dygraph variable of shape ``(1, C, out_size, out_size)``,
            float32.
        """
        # convert box to corners (0-indexed)
        size = round(size)
        top_left = np.floor(center - (size + 1) / 2 + 0.5)
        corners = np.concatenate((top_left, top_left + size))
        corners = np.round(corners).astype(int)

        # pad image if necessary
        pads = np.concatenate((-corners[:2], corners[2:] - image.shape[:2]))
        npad = max(0, int(pads.max()))
        if npad > 0:
            image = cv2.copyMakeBorder(
                image,
                npad,
                npad,
                npad,
                npad,
                cv2.BORDER_CONSTANT,
                value=pad_color)

        # crop image patch
        corners = (corners + npad).astype(int)
        patch = image[corners[0]:corners[2], corners[1]:corners[3]]

        # resize to out_size
        patch = cv2.resize(patch, (out_size, out_size))
        # HWC -> CHW, then add the batch axis.
        patch = patch.transpose(2, 0, 1)
        patch = patch[np.newaxis, :, :, :]
        patch = patch.astype(np.float32)
        patch = fluid.dygraph.to_variable(patch)
        return patch

    def _convert_bbox(self, delta, anchor):
        """Decode regression output into ``(cx, cy, w, h)`` boxes.

        Args:
            delta: 4-D network output; it is transposed with
                ``[1, 2, 3, 0]`` and flattened to ``(4, N)``.
            anchor: ``(N, 4)`` anchor table in ``(cx, cy, w, h)`` form.

        Returns:
            ``(4, N)`` numpy array of decoded boxes.
        """
        delta = fluid.layers.transpose(delta, [1, 2, 3, 0])
        delta = fluid.layers.reshape(delta, [4, -1]).numpy()

        # Offsets are relative to anchor size; sizes are log-scale factors.
        delta[0, :] = delta[0, :] * anchor[:, 2] + anchor[:, 0]
        delta[1, :] = delta[1, :] * anchor[:, 3] + anchor[:, 1]
        delta[2, :] = np.exp(delta[2, :]) * anchor[:, 2]
        delta[3, :] = np.exp(delta[3, :]) * anchor[:, 3]
        return delta

    def _convert_score(self, score):
        """Turn classification output into a 1-D score per anchor position.

        The output is reshaped to two rows, soft-maxed across them, and
        the second class is returned (presumably the foreground class --
        confirm against the network head).
        """
        score = fluid.layers.transpose(score, [1, 2, 3, 0])
        score = fluid.layers.reshape(score, [2, -1])
        score = fluid.layers.transpose(score, [1, 0])
        score = fluid.layers.softmax(score, axis=1)[:, 1].numpy()
        return score

    def _bbox_clip(self, cx, cy, width, height, boundary):
        """Clamp a box to the image.

        Args:
            cx, cy: box center.
            width, height: box size.
            boundary: ``(image_height, image_width)``.

        Returns:
            ``(cx, cy, width, height)`` with the center inside the image
            and each side limited to ``[10, image side]``.
        """
        cx = max(0, min(cx, boundary[1]))
        cy = max(0, min(cy, boundary[0]))
        width = max(10, min(width, boundary[1]))
        height = max(10, min(height, boundary[0]))
        return cx, cy, width, height

    def _crop_back(self, image, bbox, out_sz, padding=0):
        """Affine-warp ``image`` so that ``bbox`` fills an ``out_sz`` canvas.

        Args:
            image: 2-D array to warp.
            bbox: ``[x, y, w, h]`` region of ``image`` to map to the output.
            out_sz: output size as ``(width, height)``.
            padding: value used outside the source image.

        Returns:
            The warped array of size ``out_sz``.
        """
        a = (out_sz[0] - 1) / bbox[2]
        b = (out_sz[1] - 1) / bbox[3]
        c = -a * bbox[0]
        d = -b * bbox[1]
        # np.float was removed in NumPy 1.24; np.float64 is the same dtype.
        mapping = np.array([[a, 0, c], [0, b, d]]).astype(np.float64)
        crop = cv2.warpAffine(
            image,
            mapping,
            (out_sz[0], out_sz[1]),
            flags=cv2.INTER_LINEAR,
            borderMode=cv2.BORDER_CONSTANT,
            borderValue=padding)
        return crop

    def _mask_post_processing(self, mask):
        """Convert a soft mask into the four corners of a rotated box.

        Args:
            mask: 2-D array of mask probabilities in image coordinates.

        Returns:
            ``(4, 2)`` array of ``(x, y)`` corners: the minimum-area
            rectangle of the largest contour, or the current axis-aligned
            target box when no contour has area above 100.
        """
        target_mask = (mask > self.params.mask_threshold)
        target_mask = target_mask.astype(np.uint8)

        # findContours returns (contours, hierarchy) in OpenCV 2.x/4.x and
        # (image, contours, hierarchy) in 3.x; the contours are always the
        # second-to-last element, so no version sniffing is needed.
        contours = cv2.findContours(
            target_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]
        cnt_area = [cv2.contourArea(cnt) for cnt in contours]

        if len(contours) != 0 and np.max(cnt_area) > 100:
            contour = contours[np.argmax(cnt_area)]
            polygon = contour.reshape(-1, 2)
            rbox_in_img = cv2.boxPoints(cv2.minAreaRect(polygon))
        else:  # empty mask
            # Fall back to the tracked axis-aligned box.
            yx = self.pos - self.target_sz / 2
            location = np.array(
                [yx[1], yx[0], self.target_sz[1], self.target_sz[0]],
                'float32')
            rbox_in_img = np.array([
                [location[0], location[1]],
                [location[0] + location[2], location[1]],
                [location[0] + location[2], location[1] + location[3]],
                [location[0], location[1] + location[3]],
            ])
        return rbox_in_img