# petr_head.py

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. 
#   
# Licensed under the Apache License, Version 2.0 (the "License");   
# you may not use this file except in compliance with the License.  
# You may obtain a copy of the License at   
#   
#     http://www.apache.org/licenses/LICENSE-2.0    
#   
# Unless required by applicable law or agreed to in writing, software   
# distributed under the License is distributed on an "AS IS" BASIS, 
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  
# See the License for the specific language governing permissions and   
# limitations under the License.
"""
this code is base on https://github.com/hikvision-research/opera/blob/main/opera/models/dense_heads/petr_head.py
"""
import copy
import numpy as np

import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
import paddle.distributed as dist

from ..transformers.petr_transformer import inverse_sigmoid, masked_fill
from ..initializer import constant_, normal_

__all__ = ["PETRHead"]

from functools import partial


def bias_init_with_prob(prior_prob: float) -> float:
    """Return the conv/fc bias value that corresponds to a prior probability.

    The value is ``-log((1 - prior_prob) / prior_prob)``, i.e. the logit of
    ``prior_prob``, converted to a plain Python float.
    """
    odds_against = (1 - prior_prob) / prior_prob
    return float(-np.log(odds_against))


def multi_apply(func, *args, **kwargs):
    """Call ``func`` once per input and regroup the outputs by position.

    ``func`` is mapped over the iterables in ``args`` (one element of each
    iterable per call). Every call is expected to return a tuple; the i-th
    items of all returned tuples are collected into the i-th output list.

    Args:
        func (Function): A function that will be applied to a list of
            arguments.
        *args: Iterables that are walked in lockstep to build each call.
        **kwargs: Keyword arguments bound to every call of ``func``.

    Returns:
        tuple(list): One list per kind of value returned by ``func``.
    """
    bound = func if not kwargs else partial(func, **kwargs)
    per_call_outputs = map(bound, *args)
    return tuple(list(column) for column in zip(*per_call_outputs))


def reduce_mean(tensor):
    """Obtain the mean of tensor on different GPUs.

    Args:
        tensor (Tensor): Value held by the current rank.

    Returns:
        Tensor: ``tensor`` itself when the distributed environment is not
            initialized; otherwise a new tensor holding the sum over all
            ranks of ``tensor / world_size``.
    """
    if not (dist.get_world_size() and dist.is_initialized()):
        return tensor
    world_size = paddle.to_tensor(dist.get_world_size(), dtype='float32')
    # ``Tensor.divide`` is out-of-place. Its result has to be kept and
    # all-reduced; reducing an unnamed temporary would discard the reduced
    # value and hand the local, un-averaged input back to the caller.
    averaged = tensor.divide(world_size)
    dist.all_reduce(averaged, op=dist.ReduceOp.SUM)
    return averaged


def gaussian_radius(det_size, min_overlap=0.7):
    """Calculate the gaussian radius according to object size.

    Three quadratic expressions in the object's height and width are
    evaluated and the smallest resulting radius is returned.

    NOTE(review): every root is divided by 2 even where the quadratic
    coefficient is not 1 (``a2 = 4``, ``a3 = 4 * min_overlap``). This is kept
    exactly as in the code being restyled -- confirm it is intended.

    Args:
        det_size (tuple): ``(height, width)`` of the object. The values are
            passed through ``paddle.sqrt``, so they are expected to be
            tensors.
        min_overlap (float): Overlap constant used by the three expressions.

    Returns:
        The minimum of the three candidate radii.
    """
    height, width = det_size
    side_sum = height + width

    quad_1 = 1
    lin_1 = side_sum
    const_1 = width * height * (1 - min_overlap) / (1 + min_overlap)
    root_1 = paddle.sqrt(lin_1**2 - 4 * quad_1 * const_1)
    radius_1 = (lin_1 + root_1) / 2

    quad_2 = 4
    lin_2 = 2 * side_sum
    const_2 = (1 - min_overlap) * width * height
    root_2 = paddle.sqrt(lin_2**2 - 4 * quad_2 * const_2)
    radius_2 = (lin_2 + root_2) / 2

    quad_3 = 4 * min_overlap
    lin_3 = -2 * min_overlap * side_sum
    const_3 = (min_overlap - 1) * width * height
    root_3 = paddle.sqrt(lin_3**2 - 4 * quad_3 * const_3)
    radius_3 = (lin_3 + root_3) / 2

    return min(radius_1, radius_2, radius_3)


def gaussian2D(shape, sigma=1):
    """Build a 2D gaussian kernel centred in a window of the given shape.

    Args:
        shape (tuple): ``(rows, cols)`` extent of the window.
        sigma (float): Standard deviation of the gaussian.

    Returns:
        Tensor: float32 kernel; entries smaller than float32 machine
            epsilon times the kernel maximum are set to 0.
    """
    half_rows, half_cols = [(extent - 1.) / 2. for extent in shape]
    # Column vector of row offsets and row vector of column offsets; they
    # broadcast to the full grid below.
    row_offsets = paddle.arange(
        -half_rows, half_rows + 1, dtype="float32")[:, None]
    col_offsets = paddle.arange(
        -half_cols, half_cols + 1, dtype="float32")[None, :]

    squared_dist = col_offsets * col_offsets + row_offsets * row_offsets
    kernel = paddle.exp(-squared_dist / (2 * sigma * sigma))
    cutoff = np.finfo(np.float32).eps * kernel.max()
    kernel[kernel < cutoff] = 0
    return kernel


def draw_umich_gaussian(heatmap, center, radius, k=1):
    """Splat a gaussian of the given radius onto ``heatmap`` at ``center``.

    The gaussian window is clipped to the heatmap borders and merged with the
    existing values by element-wise maximum. ``heatmap`` is written to and
    also returned.

    Args:
        heatmap (Tensor): 2D map to draw on.
        center: ``(x, y)`` position of the peak; both are cast to int.
        radius: Radius of the window; the window side is ``2 * radius + 1``.
        k (float): Scale applied to the gaussian before merging.

    Returns:
        Tensor: ``heatmap``.
    """
    diameter = 2 * radius + 1
    kernel = gaussian2D((diameter, diameter), sigma=diameter / 6)
    kernel = paddle.to_tensor(kernel, dtype=heatmap.dtype)

    cx, cy = int(center[0]), int(center[1])
    radius = int(radius)
    height, width = heatmap.shape[0:2]

    # Extent of the window on each side of the centre, limited by the border.
    left = min(cx, radius)
    right = min(width - cx, radius + 1)
    top = min(cy, radius)
    bottom = min(height - cy, radius + 1)

    hm_rows = slice(cy - top, cy + bottom)
    hm_cols = slice(cx - left, cx + right)
    heatmap_patch = heatmap[hm_rows, hm_cols]
    kernel_patch = kernel[radius - top:radius + bottom,
                          radius - left:radius + right]
    # Skip the write when either patch is empty.
    if min(kernel_patch.shape) > 0 and min(heatmap_patch.shape) > 0:
        heatmap[hm_rows, hm_cols] = paddle.maximum(heatmap_patch,
                                                   kernel_patch * k)
    return heatmap


@register
class PETRHead(nn.Layer):
    """Head of `End-to-End Multi-Person Pose Estimation with Transformers`.

    Args:
        num_classes (int): Number of categories excluding the background.
        in_channels (int): Number of channels in the input feature map.
        num_query (int): Number of query in Transformer.
        num_kpt_fcs (int, optional): Number of fully-connected layers used in
            `FFN`, which is then used for the keypoint regression head.
            Default 2.
        transformer (obj:`mmcv.ConfigDict`|dict): ConfigDict is used for
            building the Encoder and Decoder. Default: None.
        sync_cls_avg_factor (bool): Whether to sync the avg_factor of
            all ranks. Default to False.
        positional_encoding (obj:`mmcv.ConfigDict`|dict):
            Config for position encoding.
        loss_cls (obj:`mmcv.ConfigDict`|dict): Config of the
            classification loss. Default `CrossEntropyLoss`.
        loss_kpt (obj:`mmcv.ConfigDict`|dict): Config of the
            regression loss. Default `L1Loss`.
        loss_oks (obj:`mmcv.ConfigDict`|dict): Config of the
            regression oks loss. Default `OKSLoss`.
        loss_hm (obj:`mmcv.ConfigDict`|dict): Config of the
            regression heatmap loss. Default `NegLoss`.
        as_two_stage (bool) : Whether to generate the proposal from
            the outputs of encoder.
        with_kpt_refine (bool): Whether to refine the reference points
            in the decoder. Defaults to True.
        test_cfg (obj:`mmcv.ConfigDict`|dict): Testing config of
            transformer head.
        init_cfg (dict or list[dict], optional): Initialization config dict.
            Default: None.
    """
    __inject__ = [
        "transformer", "positional_encoding", "assigner", "sampler", "loss_cls",
        "loss_kpt", "loss_oks", "loss_hm", "loss_kpt_rpn", "loss_kpt_refine",
        "loss_oks_refine"
    ]

    def __init__(self,
                 num_classes,
                 in_channels,
                 num_query=100,
                 num_kpt_fcs=2,
                 num_keypoints=17,
                 transformer=None,
                 sync_cls_avg_factor=True,
                 positional_encoding='SinePositionalEncoding',
                 loss_cls='FocalLoss',
                 loss_kpt='L1Loss',
                 loss_oks='OKSLoss',
                 loss_hm='CenterFocalLoss',
                 with_kpt_refine=True,
                 assigner='PoseHungarianAssigner',
                 sampler='PseudoSampler',
                 loss_kpt_rpn='L1Loss',
                 loss_kpt_refine='L1Loss',
                 loss_oks_refine='opera.OKSLoss',
                 test_cfg=dict(max_per_img=100),
                 init_cfg=None,
                 **kwargs):
        """Store the configuration, then build and initialize the layers.

        ``transformer``, ``positional_encoding`` and ``loss_cls`` are used as
        objects here (attributes are read from them), so the string defaults
        only work once they have been replaced by real instances --
        presumably through the ``__inject__`` mechanism; verify against the
        workspace registry. ``init_cfg`` and ``**kwargs`` are accepted but
        not used in this method.
        """
        # NOTE(review): the comment that used to sit here referred to
        # `AnchorFreeHead` / `TransformerHead`, which do not exist in this
        # file; it looks inherited from the implementation this code is
        # based on. This class derives from `nn.Layer`, so only the plain
        # base initializer is called.
        super().__init__()
        self.bg_cls_weight = 0
        self.sync_cls_avg_factor = sync_cls_avg_factor
        self.assigner = assigner
        self.sampler = sampler
        self.num_query = num_query
        self.num_classes = num_classes
        self.in_channels = in_channels
        self.num_kpt_fcs = num_kpt_fcs
        # NOTE(review): the default dict is created once at definition time
        # and is shared by every instance that does not pass `test_cfg`.
        self.test_cfg = test_cfg
        self.fp16_enabled = False
        # Two-stage mode is dictated by the transformer, not by an argument.
        self.as_two_stage = transformer.as_two_stage
        self.with_kpt_refine = with_kpt_refine
        self.num_keypoints = num_keypoints
        self.loss_cls = loss_cls
        self.loss_kpt = loss_kpt
        self.loss_kpt_rpn = loss_kpt_rpn
        self.loss_kpt_refine = loss_kpt_refine
        self.loss_oks = loss_oks
        self.loss_oks_refine = loss_oks_refine
        self.loss_hm = loss_hm
        # A sigmoid-based classification loss needs no extra background
        # channel; otherwise one more output channel is reserved.
        if self.loss_cls.use_sigmoid:
            self.cls_out_channels = num_classes
        else:
            self.cls_out_channels = num_classes + 1
        self.positional_encoding = positional_encoding
        self.transformer = transformer
        self.embed_dims = self.transformer.embed_dims
        # assert 'num_feats' in positional_encoding
        num_feats = positional_encoding.num_pos_feats
        assert num_feats * 2 == self.embed_dims, 'embed_dims should' \
            f' be exactly 2 times of num_feats. Found {self.embed_dims}' \
            f' and {num_feats}.'
        self._init_layers()
        self.init_weights()

    def _init_layers(self):
        """Initialize classification branch and keypoint branch of head.

        Creates ``cls_branches``, ``kpt_branches``, ``query_embedding`` and
        ``fc_hm``. ``refine_kpt_branches`` is created only when
        ``self.with_kpt_refine`` is True.
        """

        fc_cls = nn.Linear(self.embed_dims, self.cls_out_channels)

        # Keypoint regression MLP: embed_dims -> 512 -> (num_kpt_fcs x 512)
        # -> 2 values per keypoint.
        kpt_branch = []
        kpt_branch.append(nn.Linear(self.embed_dims, 512))
        kpt_branch.append(nn.ReLU())
        for _ in range(self.num_kpt_fcs):
            kpt_branch.append(nn.Linear(512, 512))
            kpt_branch.append(nn.ReLU())
        kpt_branch.append(nn.Linear(512, 2 * self.num_keypoints))
        kpt_branch = nn.Sequential(*kpt_branch)

        def _get_clones(module, N):
            # N independent deep copies, each with its own parameters.
            return nn.LayerList([copy.deepcopy(module) for i in range(N)])

        # last kpt_branch is used to generate proposal from
        # encode feature map when as_two_stage is True.
        num_pred = (self.transformer.decoder.num_layers + 1) if \
            self.as_two_stage else self.transformer.decoder.num_layers

        if self.with_kpt_refine:
            # One separate set of weights per prediction layer.
            self.cls_branches = _get_clones(fc_cls, num_pred)
            self.kpt_branches = _get_clones(kpt_branch, num_pred)
        else:
            # The same module instance is repeated, so all prediction
            # layers share one set of weights.
            self.cls_branches = nn.LayerList([fc_cls for _ in range(num_pred)])
            self.kpt_branches = nn.LayerList(
                [kpt_branch for _ in range(num_pred)])

        # Twice embed_dims per query; how the two halves are used is decided
        # by the transformer (not visible here).
        self.query_embedding = nn.Embedding(self.num_query, self.embed_dims * 2)

        # Refinement MLP: num_kpt_fcs hidden layers of width embed_dims,
        # then 2 output values.
        refine_kpt_branch = []
        for _ in range(self.num_kpt_fcs):
            refine_kpt_branch.append(
                nn.Linear(self.embed_dims, self.embed_dims))
            refine_kpt_branch.append(nn.ReLU())
        refine_kpt_branch.append(nn.Linear(self.embed_dims, 2))
        refine_kpt_branch = nn.Sequential(*refine_kpt_branch)
        # NOTE(review): without with_kpt_refine the attribute
        # `refine_kpt_branches` is never set, although other methods of this
        # class index it unconditionally -- confirm that configuration is
        # unsupported.
        if self.with_kpt_refine:
            num_pred = self.transformer.refine_decoder.num_layers
            self.refine_kpt_branches = _get_clones(refine_kpt_branch, num_pred)
        # Heatmap head: one output channel per keypoint.
        self.fc_hm = nn.Linear(self.embed_dims, self.num_keypoints)

    def init_weights(self):
        """Initialize weights of the PETR head.

        The transformer initializes itself first. Classification biases are
        set from a prior probability of 0.01 when the classification loss
        uses sigmoid, the last layer bias of every keypoint (and refinement)
        branch is zeroed, and the heatmap layer gets normally distributed
        weights with a bias derived from a prior probability of 0.1.
        """
        self.transformer.init_weights()

        if self.loss_cls.use_sigmoid:
            cls_bias = bias_init_with_prob(0.01)
            for cls_layer in self.cls_branches:
                constant_(cls_layer.bias, cls_bias)

        # Output layers of the keypoint branches, followed by those of the
        # refinement branches when they exist.
        output_layers = [branch[-1] for branch in self.kpt_branches]
        if self.with_kpt_refine:
            output_layers.extend(
                branch[-1] for branch in self.refine_kpt_branches)
        for layer in output_layers:
            constant_(layer.bias, 0)

        # initialize bias for heatmap prediction
        hm_bias = bias_init_with_prob(0.1)
        normal_(self.fc_hm.weight, std=0.01)
        constant_(self.fc_hm.bias, hm_bias)

    def forward(self, mlvl_feats, img_metas):
        """Forward function.

        Args:
            mlvl_feats (tuple[Tensor]): Features from the upstream
                network, each is a 4D-tensor with shape
                (N, C, H, W).
            img_metas (list[dict]): List of image information. Every entry
                provides ``'img_shape'``; the first entry also provides
                ``'batch_input_shape'``.

        Returns:
            tuple: Seven items, in this order:

            - outputs_classes (Tensor): Classification outputs of every
              decoder layer, stacked along a new first axis.
            - outputs_kpts (Tensor): Sigmoid keypoint outputs of every
              decoder layer, stacked along a new first axis; the last
              dimension is K*2.
            - enc_outputs_class: Returned by the transformer unchanged.
            - enc_outputs_kpt (Tensor): Transformer output after sigmoid.
            - hm_proto (tuple | None): ``(heatmap prediction, mask)`` when
              the transformer returns a heatmap prototype, else None.
            - memory: Returned by the transformer unchanged.
            - mlvl_masks (list[Tensor]): Per-level masks built here.

        Raises:
            RuntimeError: If ``self.as_two_stage`` is False.
        """
        first_feat = mlvl_feats[0]
        batch_size = first_feat.shape[0]
        pad_h, pad_w = img_metas[0]['batch_input_shape']
        # Mask over the padded batch canvas: 1 inside each image, 0 on
        # the padding.
        img_masks = paddle.zeros(
            (batch_size, pad_h, pad_w), dtype=first_feat.dtype)
        for img_id in range(batch_size):
            valid_h, valid_w, _ = img_metas[img_id]['img_shape']
            img_masks[img_id, :valid_h, :valid_w] = 1

        # Resize the mask to every feature level and derive the positional
        # encoding of that level from it.
        mlvl_masks = []
        mlvl_positional_encodings = []
        for feat in mlvl_feats:
            lvl_mask = F.interpolate(
                img_masks[None], size=feat.shape[-2:]).squeeze(0)
            mlvl_masks.append(lvl_mask)
            lvl_pos = self.positional_encoding(lvl_mask).transpose(
                [0, 3, 1, 2])
            mlvl_positional_encodings.append(lvl_pos)

        kpt_branches = self.kpt_branches if self.with_kpt_refine else None
        cls_branches = self.cls_branches if self.as_two_stage else None
        (hs, init_reference, inter_references, enc_outputs_class,
         enc_outputs_kpt, hm_proto, memory) = self.transformer(
             mlvl_feats,
             mlvl_masks,
             self.query_embedding.weight,
             mlvl_positional_encodings,
             kpt_branches=kpt_branches,
             cls_branches=cls_branches)

        outputs_classes, outputs_kpts = [], []
        for lvl in range(hs.shape[0]):
            # Layer 0 starts from the initial reference, later layers from
            # the previous layer's reference.
            prev_points = init_reference if lvl == 0 \
                else inter_references[lvl - 1]
            reference = inverse_sigmoid(prev_points)
            cls_logits = self.cls_branches[lvl](hs[lvl])
            kpt_logits = self.kpt_branches[lvl](hs[lvl])
            assert reference.shape[-1] == self.num_keypoints * 2
            # The branch predicts an offset in logit space.
            kpt_logits += reference
            outputs_classes.append(cls_logits)
            outputs_kpts.append(F.sigmoid(kpt_logits))

        outputs_classes = paddle.stack(outputs_classes)
        outputs_kpts = paddle.stack(outputs_kpts)

        if hm_proto is not None:
            # get heatmap prediction (training phase)
            hm_memory, hm_mask = hm_proto
            hm_proto = (self.fc_hm(hm_memory).transpose((0, 3, 1, 2)),
                        hm_mask)

        if not self.as_two_stage:
            raise RuntimeError('only "as_two_stage=True" is supported.')
        return (outputs_classes, outputs_kpts, enc_outputs_class,
                F.sigmoid(enc_outputs_kpt), hm_proto, memory, mlvl_masks)

    def forward_refine(self, memory, mlvl_masks, refine_targets, losses,
                       img_metas):
        """Refine the matched poses and compute the refinement losses.

        Args:
            memory (Tensor): Feature memory returned by ``forward``; passed
                unchanged to ``self.transformer.forward_refine``.
            mlvl_masks (tuple[Tensor]): The key_padding_mask from
                different level used for encoder and decoder,
                each is a 3D-tensor with shape (bs, H, W).
            refine_targets (tuple[Tensor]): ``(kpt_preds, kpt_targets,
                area_targets, kpt_weights)``, as produced by ``loss``.
            losses (dict[str, Tensor]): A dictionary of loss components.
                It is updated in place with the refinement losses.
            img_metas (list[dict]): List of image information.

        Returns:
            Tensor | dict[str, Tensor]: When the layer is not in training
                mode, the stacked refined keypoints of every refine decoder
                layer. Otherwise ``losses`` with ``d{i}.loss_kpt_refine`` and
                ``d{i}.loss_oks_refine`` added for every refine layer ``i``.
        """
        kpt_preds, kpt_targets, area_targets, kpt_weights = refine_targets
        pos_inds = kpt_weights.sum(-1) > 0
        # Evaluated once and reused; `pos_inds` is never modified below.
        has_pos = bool(pos_inds.any())
        if not has_pos:
            # No positive query: feed a single all-zero dummy pose so the
            # refine decoder still runs; its losses are zeroed further down.
            pos_kpt_preds = paddle.zeros_like(kpt_preds[:1])
            pos_img_inds = paddle.zeros([1], dtype="int64")
        else:
            pos_kpt_preds = kpt_preds[pos_inds]
            # Flat query index -> index of the image it belongs to.
            pos_img_inds = (pos_inds.nonzero() /
                            self.num_query).squeeze(1).astype("int64")
        hs, init_reference, inter_references = self.transformer.forward_refine(
            mlvl_masks,
            memory,
            pos_kpt_preds.detach(),
            pos_img_inds,
            kpt_branches=self.refine_kpt_branches
            if self.with_kpt_refine else None,  # noqa:E501
        )

        outputs_kpts = []

        for lvl in range(hs.shape[0]):
            if lvl == 0:
                reference = init_reference
            else:
                reference = inter_references[lvl - 1]
            reference = inverse_sigmoid(reference)
            tmp_kpt = self.refine_kpt_branches[lvl](hs[lvl])
            assert reference.shape[-1] == 2
            tmp_kpt += reference
            outputs_kpt = F.sigmoid(tmp_kpt)
            outputs_kpts.append(outputs_kpt)
        outputs_kpts = paddle.stack(outputs_kpts)

        if not self.training:
            return outputs_kpts

        # Both normalizers go through reduce_mean before the no-positive
        # early return below, so every rank issues the same collectives.
        num_valid_kpt = paddle.clip(
            reduce_mean(kpt_weights.sum()), min=1).item()
        num_total_pos = paddle.to_tensor(
            [outputs_kpts.shape[1]], dtype=kpt_weights.dtype)
        num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()

        if not has_pos:
            # Zero-valued losses that still depend on the predictions.
            for i, kpt_refine_preds in enumerate(outputs_kpts):
                loss_kpt = loss_oks = kpt_refine_preds.sum() * 0
                losses[f'd{i}.loss_kpt_refine'] = loss_kpt
                losses[f'd{i}.loss_oks_refine'] = loss_oks
            return losses

        # Per-query image size used to rescale normalized keypoints.
        batch_size = mlvl_masks[0].shape[0]
        factors = []
        for img_id in range(batch_size):
            img_h, img_w, _ = img_metas[img_id]['img_shape']
            factor = paddle.to_tensor(
                [img_w, img_h, img_w, img_h],
                dtype="float32").squeeze(-1).unsqueeze(0).tile(
                    (self.num_query, 1))
            factors.append(factor)
        factors = paddle.concat(factors, 0)
        # Keep (img_w, img_h) and repeat the pair once per keypoint.
        factors = factors[pos_inds][:, :2].tile((1, kpt_preds.shape[-1] // 2))

        pos_kpt_weights = kpt_weights[pos_inds]
        pos_kpt_targets = kpt_targets[pos_inds]
        pos_kpt_targets_scaled = pos_kpt_targets * factors
        pos_areas = area_targets[pos_inds]
        # One validity flag per keypoint (weights come in x/y pairs).
        pos_valid = kpt_weights[pos_inds][:, 0::2]
        # At least one positive exists here, so no per-layer check is needed.
        for i, kpt_refine_preds in enumerate(outputs_kpts):
            # kpt L1 Loss
            pos_refine_preds = kpt_refine_preds.reshape(
                (kpt_refine_preds.shape[0], -1))
            loss_kpt = self.loss_kpt_refine(
                pos_refine_preds,
                pos_kpt_targets,
                pos_kpt_weights,
                avg_factor=num_valid_kpt)
            losses[f'd{i}.loss_kpt_refine'] = loss_kpt
            # kpt oks loss
            pos_refine_preds_scaled = pos_refine_preds * factors
            assert (pos_areas > 0).all()
            loss_oks = self.loss_oks_refine(
                pos_refine_preds_scaled,
                pos_kpt_targets_scaled,
                pos_valid,
                pos_areas,
                avg_factor=num_total_pos)
            losses[f'd{i}.loss_oks_refine'] = loss_oks
        return losses

    # over-write because img_metas are needed as inputs for bbox_head.
    def forward_train(self,
                      x,
                      img_metas,
                      gt_bboxes,
                      gt_labels=None,
                      gt_keypoints=None,
                      gt_areas=None,
                      gt_bboxes_ignore=None,
                      proposal_cfg=None,
                      **kwargs):
        """Forward function for training mode.

        Runs ``forward``, computes the head losses with ``loss`` and then
        adds the pose refinement losses from ``forward_refine``.

        Args:
            x (list[Tensor]): Features from backbone.
            img_metas (list[dict]): Meta information of each image, e.g.,
                image size, scaling factor, etc.
            gt_bboxes (list[Tensor]): Ground truth bboxes of the image,
                shape (num_gts, 4).
            gt_labels (list[Tensor]): Ground truth labels of each box,
                shape (num_gts,).
            gt_keypoints (list[Tensor]): Ground truth keypoints of the image,
                shape (num_gts, K*3).
            gt_areas (list[Tensor]): Ground truth mask areas of each box,
                shape (num_gts,).
            gt_bboxes_ignore (list[Tensor]): Ground truth bboxes to be
                ignored, shape (num_ignored_gts, 4).
            proposal_cfg: Must be None.

        Returns:
            dict[str, Tensor]: A dictionary of loss components.
        """
        assert proposal_cfg is None, '"proposal_cfg" must be None'
        head_outputs = self(x, img_metas)
        # The last two outputs are only needed by the refinement stage.
        pred_outputs = head_outputs[:-2]
        memory, mlvl_masks = head_outputs[-2:]

        if gt_labels is None:
            gt_inputs = (gt_bboxes, gt_keypoints, gt_areas, img_metas)
        else:
            gt_inputs = (gt_bboxes, gt_labels, gt_keypoints, gt_areas,
                         img_metas)
        losses, refine_targets = self.loss(
            *(pred_outputs + gt_inputs), gt_bboxes_ignore=gt_bboxes_ignore)

        # get pose refinement loss
        return self.forward_refine(memory, mlvl_masks, refine_targets, losses,
                                   img_metas)

    def loss(self,
             all_cls_scores,
             all_kpt_preds,
             enc_cls_scores,
             enc_kpt_preds,
             enc_hm_proto,
             gt_bboxes_list,
             gt_labels_list,
             gt_keypoints_list,
             gt_areas_list,
             img_metas,
             gt_bboxes_ignore=None):
        """Loss function.

        Args:
            all_cls_scores (Tensor): Classification score of all
                decoder layers, has shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_kpt_preds (Tensor): Sigmoid regression
                outputs of all decode layers. Each is a 4D-tensor with
                normalized coordinate format (x_{i}, y_{i}) and shape
                [nb_dec, bs, num_query, K*2].
            enc_cls_scores (Tensor): Classification scores of
                points on encode feature map, has shape
                (N, h*w, num_classes). Only be passed when as_two_stage is
                True, otherwise is None.
            enc_kpt_preds (Tensor): Regression results of each points
                on the encode feature map, has shape (N, h*w, K*2). Only be
                passed when as_two_stage is True, otherwise is None.
            enc_hm_proto (tuple): ``(hm_pred, hm_mask)`` used for the
                heatmap loss.
            gt_bboxes_list (list[Tensor]): Ground truth bboxes for each image
                with shape (num_gts, 4) in [tl_x, tl_y, br_x, br_y] format.
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
                image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,
                    ..., p^{K}_x, p^{K}_y, p^{K}_v] format.
            gt_areas_list (list[Tensor]): Ground truth mask areas for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.
            gt_bboxes_ignore (list[Tensor], optional): Bounding boxes
                which can be ignored for each image. Must be None.

        Returns:
            tuple: ``(loss_dict, refine_targets)`` where ``loss_dict`` is a
                dictionary of loss components and ``refine_targets`` is
                ``(kpt_preds, kpt_targets, area_targets, kpt_weights)`` of
                the last decoder layer.
        """
        assert gt_bboxes_ignore is None, \
            f'{self.__class__.__name__} only supports ' \
            f'for gt_bboxes_ignore setting to None.'

        # Every decoder layer is supervised with the same ground truth.
        num_dec_layers = len(all_cls_scores)
        (losses_cls, losses_kpt, losses_oks, kpt_preds_list, kpt_targets_list,
         area_targets_list, kpt_weights_list) = multi_apply(
             self.loss_single, all_cls_scores, all_kpt_preds,
             [gt_labels_list] * num_dec_layers,
             [gt_keypoints_list] * num_dec_layers,
             [gt_areas_list] * num_dec_layers,
             [img_metas] * num_dec_layers)

        loss_dict = dict()
        # loss of proposal generated from encode feature map.
        if enc_cls_scores is not None:
            # All-zero labels, one tensor per image.
            binary_labels_list = [
                paddle.zeros_like(gt_labels_list[img_idx])
                for img_idx in range(len(img_metas))
            ]
            enc_loss_cls, enc_losses_kpt = self.loss_single_rpn(
                enc_cls_scores, enc_kpt_preds, binary_labels_list,
                gt_keypoints_list, gt_areas_list, img_metas)
            loss_dict['enc_loss_cls'] = enc_loss_cls
            loss_dict['enc_loss_kpt'] = enc_losses_kpt

        # loss from the last decoder layer
        loss_dict['loss_cls'] = losses_cls[-1]
        loss_dict['loss_kpt'] = losses_kpt[-1]
        loss_dict['loss_oks'] = losses_oks[-1]
        # loss from other decoder layers
        aux_losses = zip(losses_cls[:-1], losses_kpt[:-1], losses_oks[:-1])
        for layer_idx, (cls_i, kpt_i, oks_i) in enumerate(aux_losses):
            loss_dict[f'd{layer_idx}.loss_cls'] = cls_i
            loss_dict[f'd{layer_idx}.loss_kpt'] = kpt_i
            loss_dict[f'd{layer_idx}.loss_oks'] = oks_i

        # losses of heatmap generated from P3 feature map
        hm_pred, hm_mask = enc_hm_proto
        loss_dict['loss_hm'] = self.loss_heatmap(
            hm_pred, hm_mask, gt_keypoints_list, gt_labels_list,
            gt_bboxes_list)

        refine_targets = (kpt_preds_list[-1], kpt_targets_list[-1],
                          area_targets_list[-1], kpt_weights_list[-1])
        return loss_dict, refine_targets

    def loss_heatmap(self, hm_pred, hm_mask, gt_keypoints, gt_labels,
                     gt_bboxes):
        """Compute the keypoint heatmap loss.

        A gaussian target heatmap is drawn for every visible ground-truth
        keypoint and compared with the sigmoid of ``hm_pred`` through
        ``self.loss_hm``.

        Args:
            hm_pred (Tensor): Heatmap logits with 4 dimensions; indexed as
                (image, keypoint, h, w).
            hm_mask (Tensor): Mask whose last two dimensions equal those of
                ``hm_pred``. Its boolean negation is passed to the loss.
            gt_keypoints (list[Tensor]): Per-image keypoints, reshaped here
                to (num_gts, -1, 3) with the last axis read as (x, y, v).
            gt_labels (list[Tensor]): Per-image labels; only their length is
                used.
            gt_bboxes (list[Tensor]): Per-image boxes; columns 0..3 are read
                as x1, y1, x2, y2. The tensors are not modified.

        Returns:
            The value returned by ``self.loss_hm``.
        """
        assert hm_pred.shape[-2:] == hm_mask.shape[-2:]
        _, _, h, w = hm_pred.shape
        # Ground truth is divided by this factor to reach heatmap
        # coordinates (presumably the stride of the heatmap feature level).
        hm_stride = 8
        # placeholder of heatmap target (Gaussian distribution)
        hm_target = paddle.zeros(hm_pred.shape, hm_pred.dtype)
        for i, (gt_label, gt_bbox, gt_keypoint
                ) in enumerate(zip(gt_labels, gt_bboxes, gt_keypoints)):
            if gt_label.shape[0] == 0:
                continue
            # Work on a copy so the caller's keypoints stay untouched.
            gt_keypoint = gt_keypoint.reshape((gt_keypoint.shape[0], -1,
                                               3)).clone()
            gt_keypoint[..., :2] /= hm_stride

            assert gt_keypoint[..., 0].max() <= w + 0.5  # new coordinate system
            assert gt_keypoint[..., 1].max() <= h + 0.5  # new coordinate system
            # Out-of-place on purpose: an augmented assignment may rescale
            # the caller's boxes in place, depending on the framework.
            gt_bbox = gt_bbox / hm_stride
            gt_w = gt_bbox[:, 2] - gt_bbox[:, 0]
            gt_h = gt_bbox[:, 3] - gt_bbox[:, 1]
            for j in range(gt_label.shape[0]):
                # get heatmap radius
                kp_radius = paddle.clip(
                    paddle.floor(
                        gaussian_radius(
                            (gt_h[j], gt_w[j]), min_overlap=0.9)),
                    min=0,
                    max=3)
                for k in range(self.num_keypoints):
                    # Only keypoints with a positive third value are drawn.
                    if gt_keypoint[j, k, 2] > 0:
                        gt_kp = gt_keypoint[j, k, :2]
                        gt_kp_int = paddle.floor(gt_kp)
                        hm_target[i, k] = draw_umich_gaussian(
                            hm_target[i, k], gt_kp_int, kp_radius)
        # compute heatmap loss
        hm_pred = paddle.clip(
            F.sigmoid(hm_pred), min=1e-4, max=1 - 1e-4)  # refer to CenterNet
        loss_hm = self.loss_hm(
            hm_pred,
            hm_target.detach(),
            mask=~hm_mask.astype("bool").unsqueeze(1))
        return loss_hm

    def loss_single(self, cls_scores, kpt_preds, gt_labels_list,
                    gt_keypoints_list, gt_areas_list, img_metas):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            kpt_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (x_{i}, y_{i}) and
                shape [bs, num_query, K*2].
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
                image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,
                ..., p^{K}_x, p^{K}_y, p^{K}_v] format.
            gt_areas_list (list[Tensor]): Ground truth mask areas for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.

        Returns:
            tuple: ``(loss_cls, loss_kpt, loss_oks, kpt_preds, kpt_targets,
                area_targets, kpt_weights)``. ``kpt_preds`` is the input
                flattened to 2D (last dimension kept); the targets and
                weights are the per-image results of ``get_targets``
                concatenated along the first axis.
        """
        num_imgs = cls_scores.shape[0]
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)]
        cls_reg_targets = self.get_targets(cls_scores_list, kpt_preds_list,
                                           gt_labels_list, gt_keypoints_list,
                                           gt_areas_list, img_metas)
        (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,
         area_targets_list, num_total_pos, num_total_neg) = cls_reg_targets
        # Merge the per-image targets into batch-level tensors.
        labels = paddle.concat(labels_list, 0)
        label_weights = paddle.concat(label_weights_list, 0)
        kpt_targets = paddle.concat(kpt_targets_list, 0)
        kpt_weights = paddle.concat(kpt_weights_list, 0)
        area_targets = paddle.concat(area_targets_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape((-1, self.cls_out_channels))
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                paddle.to_tensor(
                    [cls_avg_factor], dtype=cls_scores.dtype))
        # Lower-bound the normalizer by 1.
        cls_avg_factor = max(cls_avg_factor, 1)

        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # Compute the average number of gt keypoints across all gpus, for
        # normalization purposes
        num_total_pos = paddle.to_tensor([num_total_pos], dtype=loss_cls.dtype)
        num_total_pos = paddle.clip(reduce_mean(num_total_pos), min=1).item()

        # construct factors used for rescale keypoints
        # (kpt_preds is still 3D here, so this iterates over images)
        factors = []
        for img_meta, kpt_pred in zip(img_metas, kpt_preds):
            img_h, img_w, _ = img_meta['img_shape']
            factor = paddle.to_tensor(
                [img_w, img_h, img_w, img_h],
                dtype=kpt_pred.dtype).squeeze().unsqueeze(0).tile(
                    (kpt_pred.shape[0], 1))
            factors.append(factor)
        factors = paddle.concat(factors, 0)

        # keypoint regression loss
        kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1]))
        num_valid_kpt = paddle.clip(
            reduce_mean(kpt_weights.sum()), min=1).item()
        # assert num_valid_kpt == (kpt_targets>0).sum().item()
        loss_kpt = self.loss_kpt(
            kpt_preds,
            kpt_targets.detach(),
            kpt_weights.detach(),
            avg_factor=num_valid_kpt)

        # keypoint oks loss
        # A query counts as positive when any of its keypoint weights is set.
        pos_inds = kpt_weights.sum(-1) > 0
        if not pos_inds.any():
            # Zero-valued loss that still depends on the predictions.
            loss_oks = kpt_preds.sum() * 0
        else:
            # Keep (img_w, img_h) and repeat the pair once per keypoint.
            factors = factors[pos_inds][:, :2].tile((
                (1, kpt_preds.shape[-1] // 2)))
            pos_kpt_preds = kpt_preds[pos_inds] * factors
            pos_kpt_targets = kpt_targets[pos_inds] * factors
            pos_areas = area_targets[pos_inds]
            # One validity flag per keypoint (weights come in x/y pairs).
            pos_valid = kpt_weights[pos_inds][..., 0::2]
            assert (pos_areas > 0).all()
            loss_oks = self.loss_oks(
                pos_kpt_preds,
                pos_kpt_targets,
                pos_valid,
                pos_areas,
                avg_factor=num_total_pos)
        return loss_cls, loss_kpt, loss_oks, kpt_preds, kpt_targets, \
            area_targets, kpt_weights

    def get_targets(self, cls_scores_list, kpt_preds_list, gt_labels_list,
                    gt_keypoints_list, gt_areas_list, img_metas):
        """Collect per-image classification and keypoint targets for a batch.

        ``_get_target_single`` is applied to every image through
        ``multi_apply`` and the per-image results are regrouped into one
        list per target kind. Outputs from a single decoder layer of a
        single feature level are used.

        Args:
            cls_scores_list (list[Tensor]): Per-image score logits from one
                decoder layer, each with shape
                [num_query, cls_out_channels].
            kpt_preds_list (list[Tensor]): Per-image sigmoid keypoint
                outputs from one decoder layer, in normalized
                (x_{i}, y_{i}) coordinates with shape [num_query, K*2].
            gt_labels_list (list[Tensor]): Per-image ground truth class
                indices with shape (num_gts, ).
            gt_keypoints_list (list[Tensor]): Per-image ground truth
                keypoints with shape (num_gts, K*3).
            gt_areas_list (list[Tensor]): Per-image ground truth mask areas
                with shape (num_gts, ).
            img_metas (list[dict]): Meta information of each image.

        Returns:
            tuple: the following entries, in this order.

                - labels_list (list[Tensor]): Labels of every image.
                - label_weights_list (list[Tensor]): Label weights of every
                    image.
                - kpt_targets_list (list[Tensor]): Keypoint targets of
                    every image.
                - kpt_weights_list (list[Tensor]): Keypoint weights of
                    every image.
                - area_targets_list (list[Tensor]): Area targets of every
                    image.
                - num_total_pos: Sum of ``numel()`` over the per-image
                    positive indices.
                - num_total_neg: Sum of ``numel()`` over the per-image
                    negative indices.
        """
        per_image_targets = multi_apply(
            self._get_target_single, cls_scores_list, kpt_preds_list,
            gt_labels_list, gt_keypoints_list, gt_areas_list, img_metas)
        (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,
         area_targets_list, pos_inds_list, neg_inds_list) = per_image_targets

        # The index tensors themselves are only needed for their sizes.
        num_total_pos = sum(pos.numel() for pos in pos_inds_list)
        num_total_neg = sum(neg.numel() for neg in neg_inds_list)

        return (labels_list, label_weights_list, kpt_targets_list,
                kpt_weights_list, area_targets_list, num_total_pos,
                num_total_neg)

    def _get_target_single(self, cls_score, kpt_pred, gt_labels, gt_keypoints,
                           gt_areas, img_meta):
        """Compute regression and classification targets for one image.

        Queries are matched to ground truth by ``self.assigner`` and
        ``self.sampler``. Every query starts with label ``self.num_classes``
        and zero keypoint/area targets; only the matched (positive) queries
        are then overwritten with ground truth values. Outputs from a single
        decoder layer of a single feature level are used.

        Args:
            cls_score (Tensor): Box score logits from a single decoder layer
                for one image. Shape [num_query, cls_out_channels].
            kpt_pred (Tensor): Sigmoid outputs from a single decoder layer
                for one image, with normalized coordinate (x_{i}, y_{i}) and
                shape [num_query, K*2].
            gt_labels (Tensor): Ground truth class indices for one image.
                NOTE(review): documented upstream as shape (num_gts, ), but
                the code takes ``[..., 0]`` after gathering, so a trailing
                axis such as (num_gts, 1) seems to be expected -- confirm.
            gt_keypoints (Tensor): Ground truth keypoints for one image with
                shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v, ...,
                p^{K}_x, p^{K}_y, p^{K}_v] format.
            gt_areas (Tensor): Ground truth mask areas for one image.
                NOTE(review): same trailing-axis remark as for ``gt_labels``
                -- confirm against the caller.
            img_meta (dict): Meta information for one image. Only
                ``img_meta['img_shape']`` is read here (it is also passed on
                to the assigner).

        Returns:
            tuple[Tensor]: a tuple containing the following for one image.

                - labels (Tensor): Labels of each image.
                - label_weights (Tensor): Label weights of each image.
                - kpt_targets (Tensor): Keypoint targets of each image.
                - kpt_weights (Tensor): Keypoint weights of each image.
                - area_targets (Tensor): Area targets of each image.
                - pos_inds (Tensor): Sampled positive indices for each image.
                - neg_inds (Tensor): Sampled negative indices for each image.
        """
        num_bboxes = kpt_pred.shape[0]
        # assigner and sampler
        assign_result = self.assigner.assign(cls_score, kpt_pred, gt_labels,
                                             gt_keypoints, gt_areas, img_meta)
        sampling_result = self.sampler.sample(assign_result, kpt_pred,
                                              gt_keypoints)

        pos_inds = sampling_result.pos_inds
        neg_inds = sampling_result.neg_inds

        # label targets
        # Defaults for unmatched queries: label = self.num_classes, label
        # weight = 1, and all-zero keypoint targets / weights / areas.
        labels = paddle.full((num_bboxes, ), self.num_classes, dtype="int64")
        label_weights = paddle.ones((num_bboxes, ), dtype=gt_labels.dtype)
        kpt_targets = paddle.zeros_like(kpt_pred)
        kpt_weights = paddle.zeros_like(kpt_pred)
        area_targets = paddle.zeros((kpt_pred.shape[0], ), dtype=kpt_pred.dtype)

        # No positive query: return the defaults untouched.
        if pos_inds.size == 0:
            return (labels, label_weights, kpt_targets, kpt_weights,
                    area_targets, pos_inds, neg_inds)

        # Positive queries take the class of their assigned ground truth.
        labels[pos_inds] = gt_labels[sampling_result.pos_assigned_gt_inds][
            ..., 0].astype("int64")

        img_h, img_w, _ = img_meta['img_shape']
        # keypoint targets
        # Regroup the flat K*3 layout into (num_pos, K, 3) = (x, y, v).
        pos_gt_kpts = gt_keypoints[sampling_result.pos_assigned_gt_inds]
        pos_gt_kpts = pos_gt_kpts.reshape(
            (len(sampling_result.pos_assigned_gt_inds), -1, 3))
        # A keypoint counts as valid when its third component is > 0.
        valid_idx = pos_gt_kpts[:, :, 2] > 0
        pos_kpt_weights = kpt_weights[pos_inds].reshape(
            (pos_gt_kpts.shape[0], kpt_weights.shape[-1] // 2, 2))
        # Equivalent intent: pos_kpt_weights[valid_idx][...] = 1.0
        # masked_fill is a project helper; presumably it writes 1.0 wherever
        # the mask is True, with the mask broadcast over the (x, y) axis
        # -- TODO confirm.
        pos_kpt_weights = masked_fill(pos_kpt_weights,
                                      valid_idx.unsqueeze(-1), 1.0)
        kpt_weights[pos_inds] = pos_kpt_weights.reshape(
            (pos_kpt_weights.shape[0], kpt_pred.shape[-1]))

        # Divide x by the image width and y by the image height so that the
        # targets are expressed relative to the image size.
        factor = paddle.to_tensor(
            [img_w, img_h], dtype=kpt_pred.dtype).squeeze().unsqueeze(0)
        pos_gt_kpts_normalized = pos_gt_kpts[..., :2]
        pos_gt_kpts_normalized[..., 0] = pos_gt_kpts_normalized[..., 0] / \
            factor[:, 0:1]
        pos_gt_kpts_normalized[..., 1] = pos_gt_kpts_normalized[..., 1] / \
            factor[:, 1:2]
        kpt_targets[pos_inds] = pos_gt_kpts_normalized.reshape(
            (pos_gt_kpts.shape[0], kpt_pred.shape[-1]))

        # Area of the assigned ground truth for each positive query.
        pos_gt_areas = gt_areas[sampling_result.pos_assigned_gt_inds][..., 0]
        area_targets[pos_inds] = pos_gt_areas

        return (labels, label_weights, kpt_targets, kpt_weights, area_targets,
                pos_inds, neg_inds)

    def loss_single_rpn(self, cls_scores, kpt_preds, gt_labels_list,
                        gt_keypoints_list, gt_areas_list, img_metas):
        """Loss function for outputs from a single decoder layer of a single
        feature level.

        Targets are built with ``get_targets``; the classification loss is
        computed with ``self.loss_cls`` and the keypoint regression loss
        with ``self.loss_kpt_rpn``.

        Args:
            cls_scores (Tensor): Box score logits from a single decoder layer
                for all images. Shape [bs, num_query, cls_out_channels].
            kpt_preds (Tensor): Sigmoid outputs from a single decoder layer
                for all images, with normalized coordinate (x_{i}, y_{i}) and
                shape [bs, num_query, K*2].
            gt_labels_list (list[Tensor]): Ground truth class indices for each
                image with shape (num_gts, ).
            gt_keypoints_list (list[Tensor]): Ground truth keypoints for each
                image with shape (num_gts, K*3) in [p^{1}_x, p^{1}_y, p^{1}_v,
                ..., p^{K}_x, p^{K}_y, p^{K}_v] format.
            gt_areas_list (list[Tensor]): Ground truth mask areas for each
                image with shape (num_gts, ).
            img_metas (list[dict]): List of image meta information.

        Returns:
            tuple: ``(loss_cls, loss_kpt)``, the classification loss and the
                keypoint regression loss of this layer.
        """
        num_imgs = cls_scores.shape[0]
        cls_scores_list = [cls_scores[i] for i in range(num_imgs)]
        kpt_preds_list = [kpt_preds[i] for i in range(num_imgs)]
        # The area targets returned by get_targets are not used by this loss.
        (labels_list, label_weights_list, kpt_targets_list, kpt_weights_list,
         _, num_total_pos, num_total_neg) = self.get_targets(
             cls_scores_list, kpt_preds_list, gt_labels_list,
             gt_keypoints_list, gt_areas_list, img_metas)
        labels = paddle.concat(labels_list, 0)
        label_weights = paddle.concat(label_weights_list, 0)
        kpt_targets = paddle.concat(kpt_targets_list, 0)
        kpt_weights = paddle.concat(kpt_weights_list, 0)

        # classification loss
        cls_scores = cls_scores.reshape((-1, self.cls_out_channels))
        # construct weighted avg_factor to match with the official DETR repo
        cls_avg_factor = num_total_pos * 1.0 + \
            num_total_neg * self.bg_cls_weight
        if self.sync_cls_avg_factor:
            cls_avg_factor = reduce_mean(
                paddle.to_tensor(
                    [cls_avg_factor], dtype=cls_scores.dtype))
        # Keep the normalizer at 1 or above so the loss is never divided by
        # a smaller value.
        cls_avg_factor = max(cls_avg_factor, 1)
        loss_cls = self.loss_cls(
            cls_scores, labels, label_weights, avg_factor=cls_avg_factor)

        # keypoint regression loss, normalized by the summed keypoint
        # weights passed through reduce_mean (clipped to at least 1)
        kpt_preds = kpt_preds.reshape((-1, kpt_preds.shape[-1]))
        num_valid_kpt = paddle.clip(
            reduce_mean(kpt_weights.sum()), min=1).item()
        loss_kpt = self.loss_kpt_rpn(
            kpt_preds, kpt_targets, kpt_weights, avg_factor=num_valid_kpt)

        return loss_cls, loss_kpt

    def get_bboxes(self,
                   all_cls_scores,
                   all_kpt_preds,
                   enc_cls_scores,
                   enc_kpt_preds,
                   hm_proto,
                   memory,
                   mlvl_masks,
                   img_metas,
                   rescale=False):
        """Turn the network outputs of a batch into per-image predictions.

        Only the outputs of the last decoder layer are decoded; each image
        is handled by ``_get_bboxes_single``.

        Args:
            all_cls_scores (Tensor): Classification scores of all decoder
                layers, with shape
                [nb_dec, bs, num_query, cls_out_channels].
            all_kpt_preds (Tensor): Sigmoid regression outputs of all
                decoder layers, in normalized (x_{i}, y_{i}) coordinates
                with shape [nb_dec, bs, num_query, K*2].
            enc_cls_scores (Tensor): Classification scores of points on the
                encoder feature map, with shape (N, h*w, num_classes). Only
                passed when as_two_stage is True, otherwise None. Not read
                by this method.
            enc_kpt_preds (Tensor): Regression results of points on the
                encoder feature map, with shape (N, h*w, K*2). Only passed
                when as_two_stage is True, otherwise None. Not read by this
                method.
            hm_proto: Not read by this method.
            memory: Forwarded unchanged to ``_get_bboxes_single`` for every
                image.
            mlvl_masks: Forwarded unchanged to ``_get_bboxes_single`` for
                every image.
            img_metas (list[dict]): Meta information of each image; the
                ``'img_shape'`` and ``'scale_factor'`` entries are used.
            rescale (bool, optional): If True, return results in original
                image space. Default False.

        Returns:
            list[tuple]: One 3-tuple per image. The first item is an (n, 5)
                tensor whose first 4 columns are box positions
                (tl_x, tl_y, br_x, br_y) and whose 5-th column is a score
                between 0 and 1. The second item is an (n,) tensor holding
                the predicted class label of each box. The third item is an
                (n, K, 3) tensor in [p^{1}_x, p^{1}_y, p^{1}_v, ...,
                p^{K}_x, p^{K}_y, p^{K}_v] format.
        """
        final_cls_scores = all_cls_scores[-1]
        final_kpt_preds = all_kpt_preds[-1]

        # TODO: only support single image test -- ``memory`` and
        # ``mlvl_masks`` are handed over whole instead of being sliced per
        # image.
        return [
            self._get_bboxes_single(
                final_cls_scores[idx], final_kpt_preds[idx],
                meta['img_shape'], meta['scale_factor'], memory, mlvl_masks,
                rescale) for idx, meta in enumerate(img_metas)
        ]

    def _get_bboxes_single(self,
                           cls_score,
                           kpt_pred,
                           img_shape,
                           scale_factor,
                           memory,
                           mlvl_masks,
                           rescale=False):
        """Transform outputs from the last decoder layer into bbox predictions
        for each image.

        The top-scoring queries are selected, their keypoints are refined
        with ``self.forward_refine``, scaled to pixel coordinates and
        clipped to the image, and a box is derived for each instance as the
        circumscribed rectangle of its keypoints.

        Args:
            cls_score (Tensor): Box score logits from the last decoder layer
                for each image. Shape [num_query, cls_out_channels].
            kpt_pred (Tensor): Sigmoid outputs from the last decoder layer
                for each image, with coordinate format (x_{i}, y_{i}) and
                shape [num_query, K*2].
            img_shape (tuple[int]): Shape of input image, (height, width, 3).
            scale_factor (ndarray, optional): Scale factor of the image
                arranged as (w_scale, h_scale, w_scale, h_scale); only the
                first two entries are used.
            memory: Passed unchanged to ``self.forward_refine``.
            mlvl_masks: Passed unchanged to ``self.forward_refine``.
            rescale (bool, optional): If True, return boxes in original image
                space. Default False.

        Returns:
            tuple[Tensor]: Results of detected bboxes and labels, where n is
            the number of predictions kept by the top-k selection.

                - det_bboxes: Predicted bboxes with shape [n, 5],
                    where the first 4 columns are bounding box positions
                    (tl_x, tl_y, br_x, br_y) and the 5-th column are scores
                    between 0 and 1.
                - det_labels: Predicted labels of the corresponding box with
                    shape [n].
                - det_kpts: Predicted keypoints with shape [n, K, 3]; the
                    third channel is filled with ones.
        """
        assert len(cls_score) == len(kpt_pred)
        max_per_img = self.test_cfg.get('max_per_img', self.num_query)
        # exclude background
        if self.loss_cls.use_sigmoid:
            cls_score = F.sigmoid(cls_score)
            # top-k over the flattened (query, class) scores
            scores, indexs = cls_score.reshape([-1]).topk(max_per_img)
            det_labels = indexs % self.num_classes
            bbox_index = indexs // self.num_classes
        else:
            # The last softmax channel is dropped as background. Unlike
            # torch, Paddle's Tensor.max returns the values only, so the
            # class indices are obtained separately with argmax.
            fg_probs = F.softmax(cls_score, axis=-1)[..., :-1]
            scores = fg_probs.max(-1)
            det_labels = fg_probs.argmax(-1)
            scores, bbox_index = scores.topk(max_per_img)
            det_labels = det_labels[bbox_index]
        kpt_pred = kpt_pred[bbox_index]

        # Results after the joint decoder (default): refine the selected
        # keypoints. The pose-decoder-only alternative would be
        # kpt_pred.reshape((kpt_pred.shape[0], -1, 2)).
        refine_targets = (kpt_pred, None, None, paddle.ones_like(kpt_pred))
        refine_outputs = self.forward_refine(memory, mlvl_masks, refine_targets,
                                             None, None)
        det_kpts = refine_outputs[-1]

        # Scale to pixel coordinates and clip to the image. The clipped
        # values are written back explicitly: an in-place ``clip_`` applied
        # to ``det_kpts[..., i]`` only reaches ``det_kpts`` when indexing
        # returns a view, which is not guaranteed.
        img_h, img_w = img_shape[0], img_shape[1]
        det_kpts[..., 0] = (det_kpts[..., 0] * img_w).clip(min=0, max=img_w)
        det_kpts[..., 1] = (det_kpts[..., 1] * img_h).clip(min=0, max=img_h)
        if rescale:
            det_kpts /= paddle.to_tensor(
                scale_factor[:2],
                dtype=det_kpts.dtype).unsqueeze(0).unsqueeze(0)

        # use circumscribed rectangle box of keypoints as det bboxes
        x1 = det_kpts[..., 0].min(axis=1, keepdim=True)
        y1 = det_kpts[..., 1].min(axis=1, keepdim=True)
        x2 = det_kpts[..., 0].max(axis=1, keepdim=True)
        y2 = det_kpts[..., 1].max(axis=1, keepdim=True)
        det_bboxes = paddle.concat([x1, y1, x2, y2], axis=1)
        det_bboxes = paddle.concat((det_bboxes, scores.unsqueeze(1)), -1)

        # Append a constant third channel of ones to every keypoint.
        det_kpts = paddle.concat(
            (det_kpts, paddle.ones(
                det_kpts[..., :1].shape, dtype=det_kpts.dtype)),
            axis=2)

        return det_bboxes, det_labels, det_kpts

    def simple_test(self, feats, img_metas, rescale=False):
        """Run the head and decode its outputs, without test-time
        augmentation.

        Args:
            feats (tuple[paddle.Tensor]): Multi-level features from the
                upstream network, each is a 4D-tensor.
            img_metas (list[dict]): List of image information.
            rescale (bool, optional): Whether to rescale the results.
                Defaults to False.

        Returns:
            list[tuple[Tensor, Tensor, Tensor]]: One 3-tuple per image, as
                returned by ``get_bboxes``: ``bboxes`` with shape (n, 5),
                where 5 represents (tl_x, tl_y, br_x, br_y, score);
                ``labels`` with shape (n,); and ``kpts`` with shape
                (n, K, 3), in [p^{1}_x, p^{1}_y, p^{1}_v, ..., p^{K}_x,
                p^{K}_y, p^{K}_v] format.
        """
        # The head's forward pass needs img_metas in addition to the
        # features; its outputs are the leading positional arguments of
        # get_bboxes.
        head_outputs = self.forward(feats, img_metas)
        return self.get_bboxes(*head_outputs, img_metas, rescale=rescale)

    def get_loss(self, boxes, scores, gt_bbox, gt_class, prior_boxes):
        """Forward all arguments, in the received order, to ``self.loss``
        and return its result."""
        loss_args = (boxes, scores, gt_bbox, gt_class, prior_boxes)
        return self.loss(*loss_args)