# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from functools import partial

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
from ppdet.modeling import ops

__all__ = ['FCOSLoss', 'FCOSLossMILC', 'FCOSLossCR']


def flatten_tensor(inputs, channel_first=False):
    """
    Flatten a Tensor
    Args:
        inputs (Tensor): 4-D Tensor with shape [N, C, H, W] or [N, H, W, C]
        channel_first (bool): If True, the dimension order of the Tensor is
            [N, C, H, W], otherwise it is [N, H, W, C]
    Return:
        output_channel_last (Tensor): The flattened Tensor in channel-last
            style, with shape [N * H * W, C]
    """
    if channel_first:
        input_channel_last = paddle.transpose(inputs, perm=[0, 2, 3, 1])
    else:
        input_channel_last = inputs
    output_channel_last = paddle.flatten(
        input_channel_last, start_axis=0, stop_axis=2)
    return output_channel_last
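

# A minimal sketch (illustrative only, not part of the original API) of how
# flatten_tensor turns a per-level head output into one row per anchor point;
# the shapes below are assumptions chosen for the example.
def _example_flatten_tensor():
    x = paddle.rand([2, 80, 8, 8])  # hypothetical cls logits, [N, C, H, W]
    flat = flatten_tensor(x, channel_first=True)
    # 2 images * 8 * 8 points per image -> 128 rows of 80 class scores
    assert list(flat.shape) == [2 * 8 * 8, 80]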


@register
class FCOSLoss(nn.Layer):
    """
    FCOSLoss
    Args:
        loss_alpha (float): alpha in focal loss
        loss_gamma (float): gamma in focal loss
        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
        reg_weights (float): weight for location loss
        quality (str): quality branch, centerness/iou
    """

    def __init__(self,
                 loss_alpha=0.25,
                 loss_gamma=2.0,
                 iou_loss_type="giou",
                 reg_weights=1.0,
                 quality='centerness'):
        super(FCOSLoss, self).__init__()
        self.loss_alpha = loss_alpha
        self.loss_gamma = loss_gamma
        self.iou_loss_type = iou_loss_type
        self.reg_weights = reg_weights
        self.quality = quality

    def _iou_loss(self,
                  pred,
                  targets,
                  positive_mask,
                  weights=None,
                  return_iou=False):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding box predictions
            targets (Tensor): targets for positive samples
            positive_mask (Tensor): mask of positive samples
            weights (Tensor): weights for each positive sample
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0] * positive_mask
        pth = pred[:, 1] * positive_mask
        prw = pred[:, 2] * positive_mask
        pbh = pred[:, 3] * positive_mask

        tlw = targets[:, 0] * positive_mask
        tth = targets[:, 1] * positive_mask
        trw = targets[:, 2] * positive_mask
        tbh = targets[:, 3] * positive_mask
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)
        ious = ious * positive_mask

        if return_iou:
            return ious

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError("Unknown iou_loss_type: {}".format(
                self.iou_loss_type))
        if weights is not None:
            loss = loss * weights
        return loss
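
    # Worked example (illustrative): boxes here are (l, t, r, b) distances from
    # a shared anchor point, so the intersection reduces to an element-wise
    # minimum. For pred = (1, 1, 1, 1) and target = (2, 2, 2, 2):
    #   area_predict = (1 + 1) * (1 + 1) = 4
    #   area_target  = (2 + 2) * (2 + 2) = 16
    #   area_inter   = (1 + 1) * (1 + 1) = 4
    #   ious = (4 + 1) / (4 + 16 - 4 + 1) = 5 / 17
    # The +1.0 smoothing keeps the ratio finite for empty boxes.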

    def forward(self, cls_logits, bboxes_reg, centerness, tag_labels,
                tag_bboxes, tag_center):
        """
        Calculate the loss for classification, location and centerness
        Args:
            cls_logits (list): list of Tensor, predicted scores for the anchor
                points of each level, with shape [N, C, H, W]
            bboxes_reg (list): list of Tensor, predicted offsets for the anchor
                points of each level, with shape [N, 4, H, W]
            centerness (list): list of Tensor, predicted centerness for the
                anchor points of each level, with shape [N, 1, H, W]
            tag_labels (list): list of Tensor, category targets for each anchor
                point, with shape [N, H, W, 1]
            tag_bboxes (list): list of Tensor, bounding box targets for the
                positive samples, with shape [N, H, W, 4]
            tag_center (list): list of Tensor, centerness targets for the
                positive samples, with shape [N, H, W, 1]
        Return:
            loss (dict): loss composed of classification loss, bounding box
                loss and quality (centerness/iou) loss
        """
        cls_logits_flatten_list = []
        bboxes_reg_flatten_list = []
        centerness_flatten_list = []
        tag_labels_flatten_list = []
        tag_bboxes_flatten_list = []
        tag_center_flatten_list = []
        num_lvl = len(cls_logits)
        for lvl in range(num_lvl):
            cls_logits_flatten_list.append(
                flatten_tensor(cls_logits[lvl], True))
            bboxes_reg_flatten_list.append(
                flatten_tensor(bboxes_reg[lvl], True))
            centerness_flatten_list.append(
                flatten_tensor(centerness[lvl], True))

            tag_labels_flatten_list.append(
                flatten_tensor(tag_labels[lvl], False))
            tag_bboxes_flatten_list.append(
                flatten_tensor(tag_bboxes[lvl], False))
            tag_center_flatten_list.append(
                flatten_tensor(tag_center[lvl], False))

        cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0)
        bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0)
        centerness_flatten = paddle.concat(centerness_flatten_list, axis=0)

        tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0)
        tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0)
        tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0)
        tag_labels_flatten.stop_gradient = True
        tag_bboxes_flatten.stop_gradient = True
        tag_center_flatten.stop_gradient = True

        mask_positive_bool = tag_labels_flatten > 0
        mask_positive_bool.stop_gradient = True
        mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32")
        mask_positive_float.stop_gradient = True

        num_positive_fp32 = paddle.sum(mask_positive_float)
        num_positive_fp32.stop_gradient = True
        num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32")
        num_positive_int32 = num_positive_int32 * 0 + 1
        num_positive_int32.stop_gradient = True

        normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float)
        normalize_sum.stop_gradient = True

        # 1. cls_logits: sigmoid_focal_loss
        # expand onehot labels
        num_classes = cls_logits_flatten.shape[-1]
        tag_labels_flatten = paddle.squeeze(tag_labels_flatten, axis=-1)
        tag_labels_flatten_bin = F.one_hot(
            tag_labels_flatten, num_classes=1 + num_classes)
        tag_labels_flatten_bin = tag_labels_flatten_bin[:, 1:]
        # sigmoid_focal_loss
        cls_loss = F.sigmoid_focal_loss(
            cls_logits_flatten, tag_labels_flatten_bin) / num_positive_fp32

        if self.quality == 'centerness':
            # 2. bboxes_reg: giou_loss
            mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
            tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
            reg_loss = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=tag_center_flatten)
            reg_loss = reg_loss * mask_positive_float / normalize_sum

            # 3. centerness: sigmoid_cross_entropy_with_logits_loss
            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
            quality_loss = ops.sigmoid_cross_entropy_with_logits(
                centerness_flatten, tag_center_flatten)
            quality_loss = quality_loss * mask_positive_float / num_positive_fp32
        elif self.quality == 'iou':
            # 2. bboxes_reg: giou_loss
            mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
            tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
            reg_loss = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=None)
            reg_loss = reg_loss * mask_positive_float / num_positive_fp32
            # num_positive_fp32 is num_foreground

            # 3. centerness: sigmoid_cross_entropy_with_logits_loss
            centerness_flatten = paddle.squeeze(centerness_flatten, axis=-1)
            gt_ious = self._iou_loss(
                bboxes_reg_flatten,
                tag_bboxes_flatten,
                mask_positive_float,
                weights=None,
                return_iou=True)
            quality_loss = ops.sigmoid_cross_entropy_with_logits(
                centerness_flatten, gt_ious)
            quality_loss = quality_loss * mask_positive_float / num_positive_fp32
        else:
            raise Exception(f'Unknown quality type: {self.quality}')

        loss_all = {
            "loss_cls": paddle.sum(cls_loss),
            "loss_box": paddle.sum(reg_loss),
            "loss_quality": paddle.sum(quality_loss),
        }
        return loss_all
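

# How this loss is typically wired up (a sketch, assuming the usual ppdet YAML
# config mechanism; the exact keys in a given config file may differ). The
# fields mirror the constructor arguments of the registered class above:
#
#   FCOSLoss:
#     loss_alpha: 0.25
#     loss_gamma: 2.0
#     iou_loss_type: "giou"
#     reg_weights: 1.0
#     quality: "centerness"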


@register
class FCOSLossMILC(FCOSLoss):
    """
    FCOSLossMILC for ARSL in semi-supervised detection (SSOD)
    Args:
        loss_alpha (float): alpha in focal loss
        loss_gamma (float): gamma in focal loss
        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
        reg_weights (float): weight for location loss
    """

    def __init__(self,
                 loss_alpha=0.25,
                 loss_gamma=2.0,
                 iou_loss_type="giou",
                 reg_weights=1.0):
        super(FCOSLossMILC, self).__init__()
        self.loss_alpha = loss_alpha
        self.loss_gamma = loss_gamma
        self.iou_loss_type = iou_loss_type
        self.reg_weights = reg_weights

    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding box predictions
            targets (Tensor): targets for positive samples
            weights (Tensor): weights for each positive sample
            avg_factor (float): normalization factor for the summed loss
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError("Unknown iou_loss_type: {}".format(
                self.iou_loss_type))
        if weights is not None:
            loss = loss * weights
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # helper function: calculate IoU between aligned predictions and targets
    def _bbox_overlap_align(self, pred, targets):
        assert pred.shape[0] == targets.shape[0], \
            'the pred should be aligned with target.'

        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)
        return ious

    def iou_based_soft_label_loss(self,
                                  pred,
                                  target,
                                  alpha=0.75,
                                  gamma=2.0,
                                  iou_weighted=False,
                                  implicit_iou=None,
                                  avg_factor=None):
        assert pred.shape == target.shape
        pred = F.sigmoid(pred)
        target = target.cast(pred.dtype)
        if implicit_iou is not None:
            pred = pred * implicit_iou
        if iou_weighted:
            focal_weight = (pred - target).abs().pow(gamma) * target * (
                target > 0.0).cast('float32') + \
                alpha * (pred - target).abs().pow(gamma) * \
                (target <= 0.0).cast('float32')
        else:
            focal_weight = (pred - target).abs().pow(gamma) * (
                target > 0.0).cast('float32') + \
                alpha * (pred - target).abs().pow(gamma) * \
                (target <= 0.0).cast('float32')
        # focal loss
        loss = F.binary_cross_entropy(
            pred, target, reduction='none') * focal_weight
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss
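
    # Worked example (illustrative) of the soft-label focal weight above, for a
    # positive point with target = 0.8 (the teacher IoU) and pred = 0.3:
    #   focal_weight = |0.3 - 0.8| ** 2 = 0.25
    # while a negative point (target = 0) with pred = 0.3 is down-weighted by
    # alpha: focal_weight = 0.75 * 0.3 ** 2 = 0.0675, so confident mistakes on
    # positives dominate the gradient.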

    def forward(self, cls_logits, bboxes_reg, centerness, tag_labels,
                tag_bboxes, tag_center):
        """
        Calculate the loss for classification, location and centerness
        Args:
            cls_logits (list): list of Tensor, predicted scores for the anchor
                points of each level, with shape [N, C, H, W]
            bboxes_reg (list): list of Tensor, predicted offsets for the anchor
                points of each level, with shape [N, 4, H, W]
            centerness (list): list of Tensor, predicted centerness for the
                anchor points of each level, with shape [N, 1, H, W]
            tag_labels (list): list of Tensor, category targets for each anchor
                point, with shape [N, H, W, 1]
            tag_bboxes (list): list of Tensor, bounding box targets for the
                positive samples, with shape [N, H, W, 4]
            tag_center (list): list of Tensor, centerness targets for the
                positive samples, with shape [N, H, W, 1]
        Return:
            loss (dict): loss composed of classification loss, bounding box
                loss and iou loss
        """
        cls_logits_flatten_list = []
        bboxes_reg_flatten_list = []
        centerness_flatten_list = []
        tag_labels_flatten_list = []
        tag_bboxes_flatten_list = []
        tag_center_flatten_list = []
        num_lvl = len(cls_logits)
        for lvl in range(num_lvl):
            cls_logits_flatten_list.append(
                flatten_tensor(cls_logits[lvl], True))
            bboxes_reg_flatten_list.append(
                flatten_tensor(bboxes_reg[lvl], True))
            centerness_flatten_list.append(
                flatten_tensor(centerness[lvl], True))

            tag_labels_flatten_list.append(
                flatten_tensor(tag_labels[lvl], False))
            tag_bboxes_flatten_list.append(
                flatten_tensor(tag_bboxes[lvl], False))
            tag_center_flatten_list.append(
                flatten_tensor(tag_center[lvl], False))

        cls_logits_flatten = paddle.concat(cls_logits_flatten_list, axis=0)
        bboxes_reg_flatten = paddle.concat(bboxes_reg_flatten_list, axis=0)
        centerness_flatten = paddle.concat(centerness_flatten_list, axis=0)

        tag_labels_flatten = paddle.concat(tag_labels_flatten_list, axis=0)
        tag_bboxes_flatten = paddle.concat(tag_bboxes_flatten_list, axis=0)
        tag_center_flatten = paddle.concat(tag_center_flatten_list, axis=0)
        tag_labels_flatten.stop_gradient = True
        tag_bboxes_flatten.stop_gradient = True
        tag_center_flatten.stop_gradient = True

        # find positive index
        mask_positive_bool = tag_labels_flatten > 0
        mask_positive_bool.stop_gradient = True
        mask_positive_float = paddle.cast(mask_positive_bool, dtype="float32")
        mask_positive_float.stop_gradient = True

        num_positive_fp32 = paddle.sum(mask_positive_float)
        num_positive_fp32.stop_gradient = True
        num_positive_int32 = paddle.cast(num_positive_fp32, dtype="int32")
        num_positive_int32 = num_positive_int32 * 0 + 1
        num_positive_int32.stop_gradient = True

        # centerness target is used as reg weight
        normalize_sum = paddle.sum(tag_center_flatten * mask_positive_float)
        normalize_sum.stop_gradient = True

        # 1. IoU-based soft label loss
        # calculate iou
        with paddle.no_grad():
            pos_ind = paddle.nonzero(
                tag_labels_flatten.reshape([-1]) > 0).reshape([-1])
            pos_pred = bboxes_reg_flatten[pos_ind]
            pos_target = tag_bboxes_flatten[pos_ind]
            bbox_iou = self._bbox_overlap_align(pos_pred, pos_target)
        # pos labels
        pos_labels = tag_labels_flatten[pos_ind].squeeze(1)
        cls_target = paddle.zeros(cls_logits_flatten.shape)
        cls_target[pos_ind, pos_labels - 1] = bbox_iou
        cls_loss = self.iou_based_soft_label_loss(
            cls_logits_flatten,
            cls_target,
            implicit_iou=F.sigmoid(centerness_flatten),
            avg_factor=num_positive_fp32)

        # 2. bboxes_reg: giou_loss
        mask_positive_float = paddle.squeeze(mask_positive_float, axis=-1)
        tag_center_flatten = paddle.squeeze(tag_center_flatten, axis=-1)
        reg_loss = self._iou_loss(
            bboxes_reg_flatten,
            tag_bboxes_flatten,
            mask_positive_float,
            weights=tag_center_flatten)
        reg_loss = reg_loss * mask_positive_float / normalize_sum

        # 3. iou loss
        pos_iou_pred = paddle.squeeze(centerness_flatten, axis=-1)[pos_ind]
        loss_iou = ops.sigmoid_cross_entropy_with_logits(pos_iou_pred,
                                                         bbox_iou)
        loss_iou = loss_iou / num_positive_fp32 * 0.5

        loss_all = {
            "loss_cls": paddle.sum(cls_loss),
            "loss_box": paddle.sum(reg_loss),
            'loss_iou': paddle.sum(loss_iou),
        }
        return loss_all
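

# A minimal sketch (illustrative only) of the IoU-based soft label loss on
# dummy data; the shapes, the class index, and the 0.9 "teacher IoU" are
# assumptions chosen for the example, not ppdet defaults.
def _example_iou_based_soft_label_loss():
    loss_fn = FCOSLossMILC()
    logits = paddle.randn([6, 80])  # hypothetical per-point class logits
    target = paddle.zeros([6, 80])
    target[0, 3] = 0.9              # one positive carrying a teacher IoU
    loss = loss_fn.iou_based_soft_label_loss(logits, target, avg_factor=1.0)
    return loss.sum()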


# Concat multi-level feature maps by image
def levels_to_images(mlvl_tensor):
    batch_size = mlvl_tensor[0].shape[0]
    batch_list = [[] for _ in range(batch_size)]
    channels = mlvl_tensor[0].shape[1]
    for t in mlvl_tensor:
        t = t.transpose([0, 2, 3, 1])
        t = t.reshape([batch_size, -1, channels])
        for img in range(batch_size):
            batch_list[img].append(t[img])
    return [paddle.concat(item, axis=0) for item in batch_list]


def multi_apply(func, *args, **kwargs):
    """Apply function to a list of arguments.

    Note:
        This function applies ``func`` to multiple inputs and maps the
        multiple outputs of ``func`` into different lists. Each list
        contains the same type of outputs corresponding to different
        inputs.

    Args:
        func (Function): A function that will be applied to a list of
            arguments

    Returns:
        tuple(list): A tuple of lists, each of which contains one kind of
            result returned by the function
    """
    pfunc = partial(func, **kwargs) if kwargs else func
    map_results = map(pfunc, *args)
    return tuple(map(list, zip(*map_results)))
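

# A minimal sketch (illustrative only) of multi_apply: a two-output function
# applied element-wise over two lists, with the outputs regrouped by kind.
def _example_multi_apply():
    def _divmod(a, b):
        return a // b, a % b

    quotients, remainders = multi_apply(_divmod, [7, 9], [2, 4])
    # quotients == [3, 2], remainders == [1, 1]
    return quotients, remainders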


@register
class FCOSLossCR(FCOSLossMILC):
    """
    FCOSLoss with Consistency Regularization (CR)
    Args:
        iou_loss_type (str): location loss type, IoU/GIoU/LINEAR_IoU
        cls_weight (float): weight for classification loss
        reg_weight (float): weight for location loss
        iou_weight (float): weight for iou loss
        hard_neg_mining_flag (bool): whether to mine potential positives
            from hard negatives
    """

    def __init__(self,
                 iou_loss_type="giou",
                 cls_weight=2.0,
                 reg_weight=2.0,
                 iou_weight=0.5,
                 hard_neg_mining_flag=True):
        super(FCOSLossCR, self).__init__()
        self.iou_loss_type = iou_loss_type
        self.cls_weight = cls_weight
        self.reg_weight = reg_weight
        self.iou_weight = iou_weight
        self.hard_neg_mining_flag = hard_neg_mining_flag

    def iou_loss(self, pred, targets, weights=None, avg_factor=None):
        """
        Calculate the loss for location prediction
        Args:
            pred (Tensor): bounding box predictions
            targets (Tensor): targets for positive samples
            weights (Tensor): weights for each positive sample
            avg_factor (float): normalization factor for the summed loss
        Return:
            loss (Tensor): location loss
        """
        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]
        tlw.stop_gradient = True
        trw.stop_gradient = True
        tth.stop_gradient = True
        tbh.stop_gradient = True

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        clw = paddle.maximum(plw, tlw)
        crw = paddle.maximum(prw, trw)
        cth = paddle.maximum(pth, tth)
        cbh = paddle.maximum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)

        if self.iou_loss_type.lower() == "linear_iou":
            loss = 1.0 - ious
        elif self.iou_loss_type.lower() == "giou":
            area_uniou = area_predict + area_target - area_inter
            area_circum = (clw + crw) * (cth + cbh) + 1e-7
            giou = ious - (area_circum - area_uniou) / area_circum
            loss = 1.0 - giou
        elif self.iou_loss_type.lower() == "iou":
            loss = 0.0 - paddle.log(ious)
        else:
            raise KeyError("Unknown iou_loss_type: {}".format(
                self.iou_loss_type))
        if weights is not None:
            loss = loss * weights
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # calculate iou between aligned bboxes and targets
    def bbox_overlap_align(self, pred, targets):
        assert pred.shape[0] == targets.shape[0], \
            'the pred should be aligned with target.'

        plw = pred[:, 0]
        pth = pred[:, 1]
        prw = pred[:, 2]
        pbh = pred[:, 3]

        tlw = targets[:, 0]
        tth = targets[:, 1]
        trw = targets[:, 2]
        tbh = targets[:, 3]

        ilw = paddle.minimum(plw, tlw)
        irw = paddle.minimum(prw, trw)
        ith = paddle.minimum(pth, tth)
        ibh = paddle.minimum(pbh, tbh)

        area_predict = (plw + prw) * (pth + pbh)
        area_target = (tlw + trw) * (tth + tbh)
        area_inter = (ilw + irw) * (ith + ibh)
        ious = (area_inter + 1.0) / (
            area_predict + area_target - area_inter + 1.0)
        return ious

    # cls loss: IoU-based soft label with joint IoU
    def quality_focal_loss(self,
                           stu_cls,
                           targets,
                           quality=None,
                           weights=None,
                           alpha=0.75,
                           gamma=2.0,
                           avg_factor=None):
        stu_cls = F.sigmoid(stu_cls)
        if quality is not None:
            stu_cls = stu_cls * F.sigmoid(quality)

        focal_weight = (stu_cls - targets).abs().pow(gamma) * (
            targets > 0.0).cast('float32') + \
            alpha * (stu_cls - targets).abs().pow(gamma) * \
            (targets <= 0.0).cast('float32')

        loss = F.binary_cross_entropy(
            stu_cls, targets, reduction='none') * focal_weight

        if weights is not None:
            loss = loss * weights.reshape([-1, 1])
        loss = paddle.sum(loss)
        if avg_factor is not None:
            loss = loss / avg_factor
        return loss

    # generate points according to feature maps
    def compute_locations_by_level(self, fpn_stride, h, w):
        """
        Compute locations of anchor points of each FPN layer
        Return:
            Anchor points locations of the current FPN feature map
        """
        shift_x = paddle.arange(0, w * fpn_stride, fpn_stride)
        shift_y = paddle.arange(0, h * fpn_stride, fpn_stride)
        shift_x = paddle.unsqueeze(shift_x, axis=0)
        shift_y = paddle.unsqueeze(shift_y, axis=1)
        shift_x = paddle.expand(shift_x, shape=[h, w])
        shift_y = paddle.expand(shift_y, shape=[h, w])
        shift_x = paddle.reshape(shift_x, shape=[-1])
        shift_y = paddle.reshape(shift_y, shape=[-1])
        location = paddle.stack(
            [shift_x, shift_y], axis=-1) + float(fpn_stride) / 2
        return location
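
    # Worked example (illustrative): with fpn_stride = 8 on a 2 x 3 feature
    # map, the anchor points land at the centers of the stride cells:
    #   (4, 4), (12, 4), (20, 4),
    #   (4, 12), (12, 12), (20, 12)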

    # decode bbox from ltrb to x1y1x2y2
    def decode_bbox(self, ltrb, points):
        assert ltrb.shape[0] == points.shape[0], \
            "When decoding bbox in one image, the num of loc should be same with points."
        bbox_decoding = paddle.stack(
            [
                points[:, 0] - ltrb[:, 0], points[:, 1] - ltrb[:, 1],
                points[:, 0] + ltrb[:, 2], points[:, 1] + ltrb[:, 3]
            ],
            axis=1)
        return bbox_decoding

    # encode bbox from x1y1x2y2 to ltrb
    def encode_bbox(self, bbox, points):
        assert bbox.shape[0] == points.shape[0], \
            "When encoding bbox in one image, the num of bbox should be same with points."
        bbox_encoding = paddle.stack(
            [
                points[:, 0] - bbox[:, 0], points[:, 1] - bbox[:, 1],
                bbox[:, 2] - points[:, 0], bbox[:, 3] - points[:, 1]
            ],
            axis=1)
        return bbox_encoding

    def calculate_iou(self, gt_bbox, predict_bbox):
        # bbox area
        gt_area = (gt_bbox[:, 2] - gt_bbox[:, 0]) * \
                  (gt_bbox[:, 3] - gt_bbox[:, 1])
        predict_area = (predict_bbox[:, 2] - predict_bbox[:, 0]) * \
                       (predict_bbox[:, 3] - predict_bbox[:, 1])
        # overlap area
        lt = paddle.fmax(gt_bbox[:, None, :2], predict_bbox[None, :, :2])
        rb = paddle.fmin(gt_bbox[:, None, 2:], predict_bbox[None, :, 2:])
        wh = paddle.clip(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]
        # iou
        iou = overlap / (gt_area[:, None] + predict_area[None, :] - overlap)
        return iou
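
    # Worked example (illustrative) of the ltrb round trip: for an anchor
    # point p = (12, 4) and ltrb = (2, 3, 4, 5), decode_bbox yields the box
    # (12-2, 4-3, 12+4, 4+5) = (10, 1, 16, 9); encode_bbox on that box and
    # the same point recovers (2, 3, 4, 5).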

    # select potential positives from hard negatives
    def hard_neg_mining(self,
                        cls_score,
                        loc_ltrb,
                        quality,
                        pos_ind,
                        hard_neg_ind,
                        loc_mask,
                        loc_targets,
                        iou_thresh=0.6):
        # get points locations and strides
        points_list = []
        strides_list = []
        scale_list = []
        scale = [0, 1, 2, 3, 4]
        for fpn_scale, fpn_stride, HW in zip(scale, self.fpn_stride,
                                             self.lvl_hw):
            h, w = HW
            lvl_points = self.compute_locations_by_level(fpn_stride, h, w)
            points_list.append(lvl_points)
            lvl_strides = paddle.full([h * w, 1], fpn_stride)
            strides_list.append(lvl_strides)
            lvl_scales = paddle.full([h * w, 1], fpn_scale)
            scale_list.append(lvl_scales)
        points = paddle.concat(points_list, axis=0)
        strides = paddle.concat(strides_list, axis=0)
        scales = paddle.concat(scale_list, axis=0)

        # cls scores
        cls_vals = F.sigmoid(cls_score) * F.sigmoid(quality)
        max_vals = paddle.max(cls_vals, axis=-1)
        class_ind = paddle.argmax(cls_vals, axis=-1)

        ### calculate iou between positives and hard negatives
        # decode pos bbox
        pos_cls = max_vals[pos_ind]
        pos_loc = loc_ltrb[pos_ind].reshape([-1, 4])
        pos_strides = strides[pos_ind]
        pos_points = points[pos_ind].reshape([-1, 2])
        pos_loc = pos_loc * pos_strides
        pos_bbox = self.decode_bbox(pos_loc, pos_points)
        pos_scales = scales[pos_ind]
        # decode hard negative bbox
        hard_neg_loc = loc_ltrb[hard_neg_ind].reshape([-1, 4])
        hard_neg_strides = strides[hard_neg_ind]
        hard_neg_points = points[hard_neg_ind].reshape([-1, 2])
        hard_neg_loc = hard_neg_loc * hard_neg_strides
        hard_neg_bbox = self.decode_bbox(hard_neg_loc, hard_neg_points)
        hard_neg_scales = scales[hard_neg_ind]
        # iou between pos bbox and hard negative bbox
        hard_neg_pos_iou = self.calculate_iou(hard_neg_bbox, pos_bbox)

        ### select potential positives from hard negatives
        # scale flag
        scale_temp = paddle.abs(
            pos_scales.reshape([-1])[None, :] -
            hard_neg_scales.reshape([-1])[:, None])
        scale_flag = (scale_temp <= 1.)
        # iou flag
        iou_flag = (hard_neg_pos_iou >= iou_thresh)
        # same class flag
        pos_class = class_ind[pos_ind]
        hard_neg_class = class_ind[hard_neg_ind]
        class_flag = pos_class[None, :] - hard_neg_class[:, None]
        class_flag = (class_flag == 0)
        # hard negative point inside positive bbox flag
        ltrb_temp = paddle.stack(
            [
                hard_neg_points[:, None, 0] - pos_bbox[None, :, 0],
                hard_neg_points[:, None, 1] - pos_bbox[None, :, 1],
                pos_bbox[None, :, 2] - hard_neg_points[:, None, 0],
                pos_bbox[None, :, 3] - hard_neg_points[:, None, 1]
            ],
            axis=-1)
        inside_flag = ltrb_temp.min(axis=-1) > 0
        # reset iou
        valid_flag = (iou_flag & class_flag & inside_flag & scale_flag)
        invalid_iou = paddle.zeros_like(hard_neg_pos_iou)
        hard_neg_pos_iou = paddle.where(valid_flag, hard_neg_pos_iou,
                                        invalid_iou)
        pos_hard_neg_max_iou = hard_neg_pos_iou.max(axis=-1)
        # select potential positives
        potential_pos_ind = (pos_hard_neg_max_iou > 0.)
        num_potential_pos = paddle.nonzero(potential_pos_ind).shape[0]
        if num_potential_pos == 0:
            return None

        ### calculate loc targets: aggregate the matched positive bboxes as
        ### the bbox target of each potential positive
        # prepare data
        potential_points = hard_neg_points[potential_pos_ind].reshape([-1, 2])
        potential_strides = hard_neg_strides[potential_pos_ind]
        potential_valid_flag = valid_flag[potential_pos_ind]
        potential_pos_ind = hard_neg_ind[potential_pos_ind]

        # get cls and box of matching positives
        pos_cls = max_vals[pos_ind]
        expand_pos_bbox = paddle.expand(
            pos_bbox,
            shape=[num_potential_pos, pos_bbox.shape[0], pos_bbox.shape[1]])
        expand_pos_cls = paddle.expand(
            pos_cls, shape=[num_potential_pos, pos_cls.shape[0]])
        invalid_cls = paddle.zeros_like(expand_pos_cls)
        expand_pos_cls = paddle.where(potential_valid_flag, expand_pos_cls,
                                      invalid_cls)
        expand_pos_cls = paddle.unsqueeze(expand_pos_cls, axis=-1)
        # aggregate box based on cls_score
        agg_bbox = (expand_pos_bbox * expand_pos_cls).sum(axis=1) \
            / expand_pos_cls.sum(axis=1)
        agg_ltrb = self.encode_bbox(agg_bbox, potential_points)
        agg_ltrb = agg_ltrb / potential_strides

        # loc target for all pos
        loc_targets[potential_pos_ind] = agg_ltrb
        loc_mask[potential_pos_ind] = 1.
        return loc_mask, loc_targets

    # get training targets
    def get_targets_per_img(self, tea_cls, tea_loc, tea_iou, stu_cls, stu_loc,
                            stu_iou):
        ### sample selection
        # prepare data
        tea_cls_scores = F.sigmoid(tea_cls) * F.sigmoid(tea_iou)
        class_ind = paddle.argmax(tea_cls_scores, axis=-1)
        max_vals = paddle.max(tea_cls_scores, axis=-1)
        # cls valid mask: positives are 1, hard negatives and negatives are 0
        cls_mask = paddle.zeros_like(max_vals)
        num_pos, num_hard_neg = 0, 0

        # mean-std selection
        # use nonzero to turn the bool mask into int indices, because the
        # indices are used to build two-dim indices below.
        # use squeeze rather than reshape to avoid errors when no score is
        # larger than the threshold.
        candidate_ind = paddle.nonzero(max_vals >= 0.1).squeeze(axis=-1)
        num_candidate = candidate_ind.shape[0]
        if num_candidate > 0:
            # pos thresh = mean + std to select pos samples
            candidate_score = max_vals[candidate_ind]
            candidate_score_mean = candidate_score.mean()
            candidate_score_std = candidate_score.std()
            pos_thresh = (candidate_score_mean + candidate_score_std).clip(
                max=0.4)
            # select pos
            pos_ind = paddle.nonzero(max_vals >= pos_thresh).squeeze(axis=-1)
            num_pos = pos_ind.shape[0]
            # select hard negatives as potential pos
            hard_neg_ind = (max_vals >= 0.1) & (max_vals < pos_thresh)
            hard_neg_ind = paddle.nonzero(hard_neg_ind).squeeze(axis=-1)
            num_hard_neg = hard_neg_ind.shape[0]
        # if no positive exists, directly select the top-10 scoring points
        # as positives
        if num_pos == 0:
            num_pos = 10
            _, pos_ind = paddle.topk(max_vals, k=num_pos)
        cls_mask[pos_ind] = 1.

        ### Consistency Regularization training targets
        # cls targets
        pos_class_ind = class_ind[pos_ind]
        cls_targets = paddle.zeros_like(tea_cls)
        cls_targets[pos_ind, pos_class_ind] = tea_cls_scores[pos_ind,
                                                             pos_class_ind]
        # hard negative cls target
        if num_hard_neg != 0:
            cls_targets[hard_neg_ind] = tea_cls_scores[hard_neg_ind]
        # loc targets
        loc_targets = paddle.zeros_like(tea_loc)
        loc_targets[pos_ind] = tea_loc[pos_ind]
        # iou targets
        iou_targets = paddle.zeros(
            shape=[tea_iou.shape[0]], dtype=tea_iou.dtype)
        iou_targets[pos_ind] = F.sigmoid(
            paddle.squeeze(
                tea_iou, axis=-1)[pos_ind])

        loc_mask = cls_mask.clone()
        # select potential positives from hard negatives for loc task training
        if (num_hard_neg > 0) and self.hard_neg_mining_flag:
            results = self.hard_neg_mining(tea_cls, tea_loc, tea_iou, pos_ind,
                                           hard_neg_ind, loc_mask, loc_targets)
            if results is not None:
                loc_mask, loc_targets = results
                loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
                iou_targets[loc_pos_ind] = F.sigmoid(
                    paddle.squeeze(
                        tea_iou, axis=-1)[loc_pos_ind])

        return cls_mask, loc_mask, \
            cls_targets, loc_targets, iou_targets
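
    # Worked example (illustrative) of the mean+std selection above: for
    # candidate scores [0.12, 0.15, 0.35], the threshold is
    # mean (0.207) + std (0.125) ~= 0.33, so 0.35 becomes a positive while the
    # two lower scores are kept as hard negatives (the threshold itself is
    # clipped at 0.4).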

    def forward(self, student_prediction, teacher_prediction):
        stu_cls_lvl, stu_loc_lvl, stu_iou_lvl = student_prediction
        tea_cls_lvl, tea_loc_lvl, tea_iou_lvl, self.fpn_stride = teacher_prediction

        # H and W of each level (used for aggregating targets)
        self.lvl_hw = []
        for t in tea_cls_lvl:
            _, _, H, W = t.shape
            self.lvl_hw.append([H, W])

        # levels to images
        stu_cls_img = levels_to_images(stu_cls_lvl)
        stu_loc_img = levels_to_images(stu_loc_lvl)
        stu_iou_img = levels_to_images(stu_iou_lvl)
        tea_cls_img = levels_to_images(tea_cls_lvl)
        tea_loc_img = levels_to_images(tea_loc_lvl)
        tea_iou_img = levels_to_images(tea_iou_lvl)

        with paddle.no_grad():
            cls_mask, loc_mask, \
                cls_targets, loc_targets, iou_targets = multi_apply(
                    self.get_targets_per_img,
                    tea_cls_img,
                    tea_loc_img,
                    tea_iou_img,
                    stu_cls_img,
                    stu_loc_img,
                    stu_iou_img)

        # flatten predictions
        stu_cls = paddle.concat(stu_cls_img, axis=0)
        stu_loc = paddle.concat(stu_loc_img, axis=0)
        stu_iou = paddle.concat(stu_iou_img, axis=0)
        # flatten targets
        cls_mask = paddle.concat(cls_mask, axis=0)
        loc_mask = paddle.concat(loc_mask, axis=0)
        cls_targets = paddle.concat(cls_targets, axis=0)
        loc_targets = paddle.concat(loc_targets, axis=0)
        iou_targets = paddle.concat(iou_targets, axis=0)

        ### training weights and avg factor
        # find positives
        cls_pos_ind = paddle.nonzero(cls_mask > 0.).squeeze(axis=-1)
        loc_pos_ind = paddle.nonzero(loc_mask > 0.).squeeze(axis=-1)
        # cls weight
        cls_sample_weights = paddle.ones([cls_targets.shape[0]])
        cls_avg_factor = paddle.max(cls_targets[cls_pos_ind],
                                    axis=-1).sum().item()
        # loc weight
        loc_sample_weights = paddle.max(cls_targets[loc_pos_ind], axis=-1)
        loc_avg_factor = loc_sample_weights.sum().item()
        # iou weight
        iou_sample_weights = paddle.ones([loc_pos_ind.shape[0]])
        iou_avg_factor = loc_pos_ind.shape[0]

        ### unsupervised loss
        # cls loss
        loss_cls = self.quality_focal_loss(
            stu_cls,
            cls_targets,
            quality=stu_iou,
            weights=cls_sample_weights,
            avg_factor=cls_avg_factor) * self.cls_weight
        # iou loss
        pos_stu_iou = paddle.squeeze(stu_iou, axis=-1)[loc_pos_ind]
        pos_iou_targets = iou_targets[loc_pos_ind]
        loss_iou = F.binary_cross_entropy(
            F.sigmoid(pos_stu_iou), pos_iou_targets,
            reduction='none') * iou_sample_weights
        loss_iou = loss_iou.sum() / iou_avg_factor * self.iou_weight
        # box loss
        pos_stu_loc = stu_loc[loc_pos_ind]
        pos_loc_targets = loc_targets[loc_pos_ind]
        loss_box = self.iou_loss(
            pos_stu_loc,
            pos_loc_targets,
            weights=loc_sample_weights,
            avg_factor=loc_avg_factor)
        loss_box = loss_box * self.reg_weight

        loss_all = {
            "loss_cls": loss_cls,
            "loss_box": loss_box,
            "loss_iou": loss_iou,
        }
        return loss_all