# -*- coding: utf-8 -*-
# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
#
# Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import megengine.functional as F
from megengine.core import Tensor

from official.vision.detection import layers


def get_focal_loss(
    logits: Tensor,
    labels: Tensor,
    ignore_label: int = -1,
    background: int = 0,
    alpha: float = 0.5,
    gamma: float = 0,
    norm_type: str = "fg",
) -> Tensor:
    r"""Focal Loss for Dense Object Detection:
    <https://arxiv.org/pdf/1708.02002.pdf>

    .. math::

        FL(p_t) = -\alpha_t(1-p_t)^\gamma \log(p_t)

    Args:
        logits (Tensor):
            the predicted logits with the shape of :math:`(B, A, C)`
        labels (Tensor):
            the assigned labels of boxes with shape of :math:`(B, A)`
        ignore_label (int):
            the value of ignore class. Default: -1
        background (int):
            the value of background class. Default: 0
        alpha (float):
            parameter to mitigate class imbalance. Default: 0.5
        gamma (float):
            parameter to mitigate easy/hard loss imbalance. Default: 0
        norm_type (str): currently supports "fg" and "none":
            "fg": loss is normalized by the number of foreground samples
            "none": no normalization

    Returns:
        the calculated focal loss.
    """
    # class ids 1..C to compare against labels; id 0 is the background class
    class_range = F.arange(1, logits.shape[2] + 1)

    # (B, A) -> (B, A, 1) so labels broadcast against the (C,) class range
    labels = F.add_axis(labels, axis=2)
    scores = F.sigmoid(logits)
    # focal modulating terms: (1 - p)^gamma for positives, p^gamma for negatives
    pos_part = (1 - scores) ** gamma * layers.logsigmoid(logits)
    neg_part = scores ** gamma * layers.logsigmoid(-logits)

    pos_loss = -(labels == class_range) * pos_part * alpha
    neg_loss = (
        -(labels != class_range) * (labels != ignore_label) * neg_part * (1 - alpha)
    )
    loss = (pos_loss + neg_loss).sum()

    if norm_type == "fg":
        fg_mask = (labels != background) * (labels != ignore_label)
        return loss / F.maximum(fg_mask.sum(), 1)
    elif norm_type == "none":
        return loss
    else:
        raise NotImplementedError
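

# A minimal usage sketch, not part of the original module: the shapes, the
# label layout and the use of ``megengine.tensor`` are illustrative
# assumptions, with alpha/gamma set to the values used in the RetinaNet paper.
def _example_get_focal_loss() -> Tensor:
    import numpy as np
    import megengine as mge

    batch, anchors, classes = 2, 8, 80  # assumed sizes
    logits = mge.tensor(np.random.randn(batch, anchors, classes).astype("float32"))
    # labels take values in [0, classes]: 0 marks background, -1 would be ignored
    labels = mge.tensor(
        np.random.randint(0, classes + 1, size=(batch, anchors)).astype("int32")
    )
    return get_focal_loss(logits, labels, alpha=0.25, gamma=2.0)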


def get_smooth_l1_loss(
    pred_bbox: Tensor,
    gt_bbox: Tensor,
    labels: Tensor,
    beta: float = 1.0,
    background: int = 0,
    ignore_label: int = -1,
    norm_type: str = "fg",
) -> Tensor:
    r"""Smooth l1 loss used in RetinaNet.

    Args:
        pred_bbox (Tensor):
            the predicted bbox with the shape of :math:`(B, A, 4)`
        gt_bbox (Tensor):
            the ground-truth bbox with the shape of :math:`(B, A, 4)`
        labels (Tensor):
            the assigned labels of boxes with shape of :math:`(B, A)`
        beta (float):
            the parameter of smooth l1 loss. Default: 1.0
        background (int):
            the value of background class. Default: 0
        ignore_label (int):
            the value of ignore class. Default: -1
        norm_type (str): currently supports "fg", "all" and "none":
            "fg": loss is normalized by the number of foreground samples
            "all": loss is normalized by the number of all samples
            "none": no normalization
    Returns:
        the calculated smooth l1 loss.
    """
    pred_bbox = pred_bbox.reshape(-1, 4)
    gt_bbox = gt_bbox.reshape(-1, 4)
    labels = labels.reshape(-1)

    fg_mask = (labels != background) * (labels != ignore_label)

    loss = get_smooth_l1_base(pred_bbox, gt_bbox, beta)
    loss = (loss.sum(axis=1) * fg_mask).sum()
    if norm_type == "fg":
        loss = loss / F.maximum(fg_mask.sum(), 1)
    elif norm_type == "all":
        all_mask = labels != ignore_label
        loss = loss / F.maximum(all_mask.sum(), 1)
    elif norm_type == "none":
        return loss
    else:
        raise NotImplementedError

    return loss
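

# A minimal usage sketch, not part of the original module: the box values and
# shapes are made-up assumptions, and ``megengine.tensor`` usage is assumed.
def _example_get_smooth_l1_loss() -> Tensor:
    import numpy as np
    import megengine as mge

    batch, anchors = 2, 8  # assumed sizes
    pred_bbox = mge.tensor(np.random.randn(batch, anchors, 4).astype("float32"))
    gt_bbox = mge.tensor(np.random.randn(batch, anchors, 4).astype("float32"))
    # labels in {-1, 0, 1, 2}: only foreground anchors (label > 0) contribute
    labels = mge.tensor(
        np.random.randint(-1, 3, size=(batch, anchors)).astype("int32")
    )
    return get_smooth_l1_loss(pred_bbox, gt_bbox, labels, beta=1.0)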


def get_smooth_l1_base(pred_bbox: Tensor, gt_bbox: Tensor, beta: float) -> Tensor:
    r"""

    Args:
        pred_bbox (Tensor):
            the predicted bbox with the shape of :math:`(N, 4)`
        gt_bbox (Tensor):
            the ground-truth bbox with the shape of :math:`(N, 4)`
        beta (float):
            the parameter of smooth l1 loss.

    Returns:
        the element-wise smooth l1 values with the shape of :math:`(N, 4)`.
    """
    x = pred_bbox - gt_bbox
    abs_x = F.abs(x)
    if beta < 1e-5:
        loss = abs_x
    else:
        in_loss = 0.5 * x ** 2 / beta
        out_loss = abs_x - 0.5 * beta

        # FIXME: F.where cannot handle 0-shape tensor yet
        # loss = F.where(abs_x < beta, in_loss, out_loss)
        in_mask = abs_x < beta
        loss = in_loss * in_mask + out_loss * (1 - in_mask)
    return loss
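

# A small worked check, not part of the original module, assuming beta = 1:
# |x| < 1 takes the quadratic branch, e.g. x = 0.5 -> 0.5 * 0.25 = 0.125,
# while x = 2.0 takes the linear branch -> 2.0 - 0.5 = 1.5.
def _example_get_smooth_l1_base() -> Tensor:
    import numpy as np
    import megengine as mge

    pred = mge.tensor(np.array([[0.5, 2.0, 0.0, -3.0]], dtype="float32"))
    gt = mge.tensor(np.zeros((1, 4), dtype="float32"))
    # expected element-wise values: [0.125, 1.5, 0.0, 2.5]
    return get_smooth_l1_base(pred, gt, beta=1.0)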


def softmax_loss(scores: Tensor, labels: Tensor, ignore_label: int = -1) -> Tensor:
    r"""Softmax cross-entropy loss, averaged over the samples whose label is
    not ``ignore_label``.
    """
    # subtract the per-sample max before exponentiating for numerical
    # stability; zero_grad keeps the shift out of the backward pass
    max_scores = F.zero_grad(scores.max(axis=1, keepdims=True))
    scores -= max_scores
    log_prob = scores - F.log(F.exp(scores).sum(axis=1, keepdims=True))
    mask = labels != ignore_label
    # zero out ignored labels so they are valid indices, then mask out their
    # gathered log-probabilities
    vlabels = labels * mask
    loss = -(F.indexing_one_hot(log_prob, vlabels.astype("int32"), 1) * mask).sum()
    loss = loss / F.maximum(mask.sum(), 1)
    return loss
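

# A minimal usage sketch, not part of the original module: scores and labels
# are illustrative assumptions; the sample labeled -1 is excluded from the
# average.
def _example_softmax_loss() -> Tensor:
    import numpy as np
    import megengine as mge

    scores = mge.tensor(np.random.randn(4, 3).astype("float32"))
    labels = mge.tensor(np.array([0, 2, 1, -1], dtype="int32"))
    return softmax_loss(scores, labels, ignore_label=-1)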