simota_assigner.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import numpy as np
import paddle.nn.functional as F
import paddle.nn as nn

from ppdet.modeling.losses.varifocal_loss import varifocal_loss
from ppdet.modeling.bbox_utils import batch_bbox_overlaps
from ppdet.core.workspace import register


@register
class SimOTAAssigner(object):
    """Computes matching between predictions and ground truth.

    Args:
        center_radius (int | float, optional): Ground truth center size
            to judge whether a prior is in center. Default 2.5.
        candidate_topk (int, optional): The candidate top-k which used to
            get top-k ious to calculate dynamic-k. Default 10.
        iou_weight (int | float, optional): The scale factor for regression
            iou cost. Default 3.0.
        cls_weight (int | float, optional): The scale factor for classification
            cost. Default 1.0.
        num_classes (int): The num_classes of dataset.
        use_vfl (int): Whether to use varifocal_loss when calculating the cost matrix.
    """
    __shared__ = ['num_classes']

    def __init__(self,
                 center_radius=2.5,
                 candidate_topk=10,
                 iou_weight=3.0,
                 cls_weight=1.0,
                 num_classes=80,
                 use_vfl=True):
        self.center_radius = center_radius
        self.candidate_topk = candidate_topk
        self.iou_weight = iou_weight
        self.cls_weight = cls_weight
        self.num_classes = num_classes
        self.use_vfl = use_vfl

    def get_in_gt_and_in_center_info(self, priors, gt_bboxes):
        num_gt = gt_bboxes.shape[0]

        repeated_x = priors[:, 0].unsqueeze(1).tile([1, num_gt])
        repeated_y = priors[:, 1].unsqueeze(1).tile([1, num_gt])
        repeated_stride_x = priors[:, 2].unsqueeze(1).tile([1, num_gt])
        repeated_stride_y = priors[:, 3].unsqueeze(1).tile([1, num_gt])

        # is prior centers in gt bboxes, shape: [n_prior, n_gt]
        l_ = repeated_x - gt_bboxes[:, 0]
        t_ = repeated_y - gt_bboxes[:, 1]
        r_ = gt_bboxes[:, 2] - repeated_x
        b_ = gt_bboxes[:, 3] - repeated_y

        deltas = paddle.stack([l_, t_, r_, b_], axis=1)
        is_in_gts = deltas.min(axis=1) > 0
        is_in_gts_all = is_in_gts.sum(axis=1) > 0

        # is prior centers in gt centers
        gt_cxs = (gt_bboxes[:, 0] + gt_bboxes[:, 2]) / 2.0
        gt_cys = (gt_bboxes[:, 1] + gt_bboxes[:, 3]) / 2.0
        ct_box_l = gt_cxs - self.center_radius * repeated_stride_x
        ct_box_t = gt_cys - self.center_radius * repeated_stride_y
        ct_box_r = gt_cxs + self.center_radius * repeated_stride_x
        ct_box_b = gt_cys + self.center_radius * repeated_stride_y

        cl_ = repeated_x - ct_box_l
        ct_ = repeated_y - ct_box_t
        cr_ = ct_box_r - repeated_x
        cb_ = ct_box_b - repeated_y

        ct_deltas = paddle.stack([cl_, ct_, cr_, cb_], axis=1)
        is_in_cts = ct_deltas.min(axis=1) > 0
        is_in_cts_all = is_in_cts.sum(axis=1) > 0

        # in boxes or in centers, shape: [num_priors]
        is_in_gts_or_centers = paddle.logical_or(is_in_gts_all, is_in_cts_all)

        is_in_gts_or_centers_inds = paddle.nonzero(
            is_in_gts_or_centers).squeeze(1)

        # both in boxes and centers, shape: [num_fg, num_gt]
        is_in_boxes_and_centers = paddle.logical_and(
            paddle.gather(
                is_in_gts.cast('int'), is_in_gts_or_centers_inds,
                axis=0).cast('bool'),
            paddle.gather(
                is_in_cts.cast('int'), is_in_gts_or_centers_inds,
                axis=0).cast('bool'))
        return is_in_gts_or_centers, is_in_boxes_and_centers

    def dynamic_k_matching(self, cost, pairwise_ious, num_gt, valid_mask):
        matching_matrix = np.zeros_like(cost.numpy())
        # select candidate topk ious for dynamic-k calculation
        topk_ious, _ = paddle.topk(pairwise_ious, self.candidate_topk, axis=0)
        # calculate dynamic k for each gt
        dynamic_ks = paddle.clip(topk_ious.sum(0).cast('int'), min=1)
        for gt_idx in range(num_gt):
            _, pos_idx = paddle.topk(
                cost[:, gt_idx], k=dynamic_ks[gt_idx], largest=False)
            matching_matrix[:, gt_idx][pos_idx.numpy()] = 1.0

        del topk_ious, dynamic_ks, pos_idx

        prior_match_gt_mask = matching_matrix.sum(1) > 1
        if prior_match_gt_mask.sum() > 0:
            cost = cost.numpy()
            cost_argmin = np.argmin(cost[prior_match_gt_mask, :], axis=1)
            matching_matrix[prior_match_gt_mask, :] *= 0.0
            matching_matrix[prior_match_gt_mask, cost_argmin] = 1.0
        # get foreground mask inside box and center prior
        fg_mask_inboxes = matching_matrix.sum(1) > 0.0
        valid_mask[valid_mask.copy()] = fg_mask_inboxes

        matched_gt_inds = matching_matrix[fg_mask_inboxes, :].argmax(1)

        matched_gt_inds = paddle.to_tensor(
            matched_gt_inds, place=pairwise_ious.place)

        return matched_gt_inds, valid_mask

    def get_sample(self, assign_gt_inds, gt_bboxes):
        pos_inds = np.unique(np.nonzero(assign_gt_inds > 0)[0])
        neg_inds = np.unique(np.nonzero(assign_gt_inds == 0)[0])
        pos_assigned_gt_inds = assign_gt_inds[pos_inds] - 1

        if gt_bboxes.size == 0:
            # hack for index error case
            assert pos_assigned_gt_inds.size == 0
            pos_gt_bboxes = np.empty_like(gt_bboxes).reshape(-1, 4)
        else:
            if len(gt_bboxes.shape) < 2:
                gt_bboxes = gt_bboxes.resize(-1, 4)
            pos_gt_bboxes = gt_bboxes[pos_assigned_gt_inds, :]
        return pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds

    def __call__(self,
                 pred_scores,
                 priors,
                 decoded_bboxes,
                 gt_bboxes,
                 gt_labels,
                 eps=1e-7):
        """Assign gt to priors using SimOTA.
        TODO: add comment.
        Returns:
            assign_result: The assigned result.
        """

        INF = 100000000
        num_gt = gt_bboxes.shape[0]
        num_bboxes = decoded_bboxes.shape[0]

        # assign 0 by default
        assigned_gt_inds = paddle.full(
            (num_bboxes, ), 0, dtype=paddle.int64).numpy()
        if num_gt == 0 or num_bboxes == 0:
            # No ground truth or boxes, return empty assignment
            priors = priors.numpy()
            labels = np.ones([num_bboxes], dtype=np.int64) * self.num_classes
            label_weights = np.ones([num_bboxes], dtype=np.float32)
            bbox_targets = np.zeros_like(priors)
            return priors, labels, label_weights, bbox_targets, 0

        valid_mask, is_in_boxes_and_center = self.get_in_gt_and_in_center_info(
            priors, gt_bboxes)

        valid_mask_inds = paddle.nonzero(valid_mask).squeeze(1)
        valid_decoded_bbox = decoded_bboxes[valid_mask_inds]
        valid_pred_scores = pred_scores[valid_mask_inds]
        num_valid = valid_decoded_bbox.shape[0]

        pairwise_ious = batch_bbox_overlaps(valid_decoded_bbox, gt_bboxes)
        if self.use_vfl:
            gt_vfl_labels = gt_labels.squeeze(-1).unsqueeze(0).tile(
                [num_valid, 1]).reshape([-1])
            valid_pred_scores = valid_pred_scores.unsqueeze(1).tile(
                [1, num_gt, 1]).reshape([-1, self.num_classes])
            vfl_score = np.zeros(valid_pred_scores.shape)
            vfl_score[np.arange(0, vfl_score.shape[0]), gt_vfl_labels.numpy(
            )] = pairwise_ious.reshape([-1])
            vfl_score = paddle.to_tensor(vfl_score)
            losses_vfl = varifocal_loss(
                valid_pred_scores, vfl_score,
                use_sigmoid=False).reshape([num_valid, num_gt])
            losses_giou = batch_bbox_overlaps(
                valid_decoded_bbox, gt_bboxes, mode='giou')
            cost_matrix = (
                losses_vfl * self.cls_weight + losses_giou * self.iou_weight +
                paddle.logical_not(is_in_boxes_and_center).cast('float32') * INF
            )
        else:
            iou_cost = -paddle.log(pairwise_ious + eps)
            gt_onehot_label = (F.one_hot(
                gt_labels.squeeze(-1).cast(paddle.int64),
                pred_scores.shape[-1]).cast('float32').unsqueeze(0).tile(
                    [num_valid, 1, 1]))

            valid_pred_scores = valid_pred_scores.unsqueeze(1).tile(
                [1, num_gt, 1])
            cls_cost = F.binary_cross_entropy(
                valid_pred_scores, gt_onehot_label, reduction='none').sum(-1)

            cost_matrix = (
                cls_cost * self.cls_weight + iou_cost * self.iou_weight +
                paddle.logical_not(is_in_boxes_and_center).cast('float32') * INF
            )

        matched_gt_inds, valid_mask = \
            self.dynamic_k_matching(
                cost_matrix, pairwise_ious, num_gt, valid_mask.numpy())

        # assign results
        gt_labels = gt_labels.numpy()
        priors = priors.numpy()
        matched_gt_inds = matched_gt_inds.numpy()
        gt_bboxes = gt_bboxes.numpy()

        assigned_gt_inds[valid_mask] = matched_gt_inds + 1

        pos_inds, neg_inds, pos_gt_bboxes, pos_assigned_gt_inds \
            = self.get_sample(assigned_gt_inds, gt_bboxes)

        num_cells = priors.shape[0]
        bbox_targets = np.zeros_like(priors)
        bbox_weights = np.zeros_like(priors)
        labels = np.ones([num_cells], dtype=np.int64) * self.num_classes
        label_weights = np.zeros([num_cells], dtype=np.float32)

        if len(pos_inds) > 0:
            pos_bbox_targets = pos_gt_bboxes
            bbox_targets[pos_inds, :] = pos_bbox_targets
            bbox_weights[pos_inds, :] = 1.0
            if not np.any(gt_labels):
                labels[pos_inds] = 0
            else:
                labels[pos_inds] = gt_labels.squeeze(-1)[pos_assigned_gt_inds]

            label_weights[pos_inds] = 1.0
        if len(neg_inds) > 0:
            label_weights[neg_inds] = 1.0

        pos_num = max(pos_inds.size, 1)

        return priors, labels, label_weights, bbox_targets, pos_num