# retina_head.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Normal, Constant
from paddle.fluid.regularizer import L2Decay
from ppdet.modeling.ops import (AnchorGenerator, RetinaTargetAssign,
                                RetinaOutputDecoder)

from ppdet.core.workspace import register

__all__ = ['RetinaHead']


@register
class RetinaHead(object):
    """
    Retina Head

    Builds the class-prediction and box-regression subnets on top of the FPN
    feature maps, generates the anchors of every FPN level, and exposes the
    training loss (`get_loss`) and the test-time prediction (`get_prediction`).

    Args:
        anchor_generator (object): `AnchorGenerator` instance
        target_assign (object): `RetinaTargetAssign` instance
        output_decoder (object): `RetinaOutputDecoder` instance
        num_convs_per_octave (int): Number of convolution layers in each octave
        num_chan (int): Number of octave output channels
        max_level (int): Highest level of FPN output
        min_level (int): Lowest level of FPN output
        prior_prob (float): Used to set the bias init for the class prediction layer
        base_scale (int): Anchors are generated based on this scale
        num_scales_per_octave (int): Number of anchor scales per octave
        num_classes (int): Number of classes
        gamma (float): The parameter in focal loss
        alpha (float): The parameter in focal loss
        sigma (float): The parameter in smooth l1 loss
    """
    __inject__ = ['anchor_generator', 'target_assign', 'output_decoder']
    __shared__ = ['num_classes']

    def __init__(self,
                 anchor_generator=AnchorGenerator().__dict__,
                 target_assign=RetinaTargetAssign().__dict__,
                 output_decoder=RetinaOutputDecoder().__dict__,
                 num_convs_per_octave=4,
                 num_chan=256,
                 max_level=7,
                 min_level=3,
                 prior_prob=0.01,
                 base_scale=4,
                 num_scales_per_octave=3,
                 num_classes=81,
                 gamma=2.0,
                 alpha=0.25,
                 sigma=3.0151134457776365):
        self.anchor_generator = anchor_generator
        self.target_assign = target_assign
        self.output_decoder = output_decoder
        self.num_convs_per_octave = num_convs_per_octave
        self.num_chan = num_chan
        self.max_level = max_level
        self.min_level = min_level
        self.prior_prob = prior_prob
        self.base_scale = base_scale
        self.num_scales_per_octave = num_scales_per_octave
        self.num_classes = num_classes
        self.gamma = gamma
        self.alpha = alpha
        self.sigma = sigma
        # The three collaborators may arrive either as ready-made instances or
        # as plain dicts of constructor keyword arguments (the defaults above
        # are such dicts); dicts are expanded into fresh instances here.
        if isinstance(anchor_generator, dict):
            self.anchor_generator = AnchorGenerator(**anchor_generator)
        if isinstance(target_assign, dict):
            self.target_assign = RetinaTargetAssign(**target_assign)
        if isinstance(output_decoder, dict):
            self.output_decoder = RetinaOutputDecoder(**output_decoder)

    def _check_num_levels(self, body_feats):
        """Assert that `body_feats` holds one entry per level in
        [min_level, max_level]."""
        assert len(body_feats) == self.max_level - self.min_level + 1

    def _num_anchors(self):
        """Return the number of anchors predicted at each spatial location."""
        return self.num_scales_per_octave * len(
            self.anchor_generator.aspect_ratios)

    def _shared_conv(self,
                     feat,
                     num_filters,
                     name,
                     share_name,
                     act=None,
                     bias_value=None):
        """
        Apply one 3x3, stride-1, padding-1 convolution of the head.

        Args:
            feat (Variable): Input of the convolution.
            num_filters (int): Number of output channels.
            name (str): Name given to the layer itself.
            share_name (str): Prefix of the weight ('_w') and bias ('_b')
                parameter names. Callers pass the min_level name for every
                level so that all levels refer to the same parameter names.
            act (str|None): Activation passed to `conv2d`.
            bias_value (float|None): If given, the bias is initialized to this
                constant; otherwise no bias initializer is specified.

        Returns:
            Variable: Output of the convolution.
        """
        bias_initializer = None
        if bias_value is not None:
            bias_initializer = Constant(value=bias_value)
        return fluid.layers.conv2d(
            input=feat,
            num_filters=num_filters,
            filter_size=3,
            stride=1,
            padding=1,
            act=act,
            name=name,
            param_attr=ParamAttr(
                name=share_name + '_w',
                initializer=Normal(
                    loc=0., scale=0.01)),
            bias_attr=ParamAttr(
                name=share_name + '_b',
                initializer=bias_initializer,
                learning_rate=2.,
                regularizer=L2Decay(0.)))

    def _subnet_tower(self, body_feats, branch, pred_channels,
                      pred_bias_value=None):
        """
        Build one prediction subnet over every FPN level.

        For each level, `num_convs_per_octave` ReLU convolutions with
        `num_chan` channels are stacked, followed by one prediction
        convolution without activation. Callers are expected to have
        validated `body_feats` with `_check_num_levels`.

        Args:
            body_feats (dict): Ordered mapping from FPN feature name to
                feature map. Keys are indexed by `max_level - lvl`, i.e. the
                first entry is used for `max_level` and the last for
                `min_level`.
            branch (str): Name infix of the subnet, 'cls' or 'bbox'.
            pred_channels (int): Output channels of the prediction layer.
            pred_bias_value (float|None): Constant used to initialize the
                bias of the prediction layer, if any.

        Returns:
            list: Predictions ordered from `min_level` to `max_level`.
        """
        fpn_names = list(body_feats.keys())
        conv_fmt = 'retnet_' + branch + '_conv_n{}_fpn{}'
        pred_fmt = 'retnet_' + branch + '_pred_fpn{}'
        pred_list = []
        for lvl in range(self.min_level, self.max_level + 1):
            feat = body_feats[fpn_names[self.max_level - lvl]]
            for i in range(self.num_convs_per_octave):
                feat = self._shared_conv(
                    feat,
                    self.num_chan,
                    name=conv_fmt.format(i, lvl),
                    share_name=conv_fmt.format(i, self.min_level),
                    act='relu')
            pred = self._shared_conv(
                feat,
                pred_channels,
                name=pred_fmt.format(lvl),
                share_name=pred_fmt.format(self.min_level),
                bias_value=pred_bias_value)
            pred_list.append(pred)
        return pred_list

    def _class_subnet(self, body_feats, spatial_scale):
        """
        Get class predictions of all FPN levels.

        Args:
            body_feats(dict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor.
                Not used by this method.

        Returns:
            cls_pred_input(list): Class prediction of all input fpn levels,
                ordered from `min_level` to `max_level`.
        """
        self._check_num_levels(body_feats)
        # Each anchor gets `num_classes - 1` class logits.
        cls_dim = self._num_anchors() * (self.num_classes - 1)
        # bias initialization: b = -log((1 - pi) / pi), pi being prior_prob
        bias_init = float(-np.log((1 - self.prior_prob) / self.prior_prob))
        return self._subnet_tower(
            body_feats, 'cls', cls_dim, pred_bias_value=bias_init)

    def _bbox_subnet(self, body_feats, spatial_scale):
        """
        Get bounding box predictions of all FPN levels.

        Args:
            body_feats(dict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor.
                Not used by this method.

        Returns:
            bbox_pred_input(list): Bounding box prediction of all input fpn
                levels, ordered from `min_level` to `max_level`.
        """
        self._check_num_levels(body_feats)
        # Four regression values per anchor.
        bbox_dim = self._num_anchors() * 4
        return self._subnet_tower(body_feats, 'bbox', bbox_dim)

    def _anchor_generate(self, body_feats, spatial_scale):
        """
        Get anchor boxes of all FPN levels.

        Args:
            body_feats(dict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor,
                indexed like the keys of `body_feats` (by `max_level - lvl`).

        Return:
            anchor_input(list): Anchors of all input fpn levels.
            anchor_var_input(list): Anchor variance of all input fpn levels.
        """
        self._check_num_levels(body_feats)
        fpn_names = list(body_feats.keys())
        anchor_list = []
        anchor_var_list = []
        for lvl in range(self.min_level, self.max_level + 1):
            idx = self.max_level - lvl
            # The stride of a level is the inverse of its spatial scale.
            stride = int(1 / spatial_scale[idx])
            # One anchor size per scale: stride * 2^(k / num_scales) * base.
            anchor_sizes = [
                stride * (2**(float(octave) /
                              float(self.num_scales_per_octave))) *
                self.base_scale
                for octave in range(self.num_scales_per_octave)
            ]
            anchor, anchor_var = self.anchor_generator(
                input=body_feats[fpn_names[idx]],
                anchor_sizes=anchor_sizes,
                aspect_ratios=self.anchor_generator.aspect_ratios,
                stride=[stride, stride])
            anchor_list.append(anchor)
            anchor_var_list.append(anchor_var)
        return anchor_list, anchor_var_list

    def _get_output(self, body_feats, spatial_scale):
        """
        Get class, bounding box predictions and anchor boxes of all FPN levels.

        Args:
            body_feats(dict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor.

        Returns:
            dict: Per-level lists, ordered from `min_level` to `max_level`,
                under the keys
                'cls_pred': class predictions reshaped to
                    (0, -1, num_classes - 1),
                'bbox_pred': box predictions reshaped to (0, -1, 4),
                'anchor': anchors reshaped to (-1, 4),
                'anchor_var': anchor variances reshaped to (-1, 4).
        """
        self._check_num_levels(body_feats)
        # class subnet
        cls_pred_list = self._class_subnet(body_feats, spatial_scale)
        # bbox subnet
        bbox_pred_list = self._bbox_subnet(body_feats, spatial_scale)
        # generate anchors
        anchor_list, anchor_var_list = self._anchor_generate(body_feats,
                                                             spatial_scale)
        output = {
            'cls_pred': [],
            'bbox_pred': [],
            'anchor': [],
            'anchor_var': [],
        }
        per_level = zip(cls_pred_list, bbox_pred_list, anchor_list,
                        anchor_var_list)
        for cls_pred, bbox_pred, anchor, anchor_var in per_level:
            # Move axis 1 (the conv output channels) last before flattening,
            # so the trailing dimension holds the values of a single anchor.
            cls_pred_transpose = fluid.layers.transpose(
                cls_pred, perm=[0, 2, 3, 1])
            output['cls_pred'].append(
                fluid.layers.reshape(
                    cls_pred_transpose, shape=(0, -1, self.num_classes - 1)))
            bbox_pred_transpose = fluid.layers.transpose(
                bbox_pred, perm=[0, 2, 3, 1])
            output['bbox_pred'].append(
                fluid.layers.reshape(
                    bbox_pred_transpose, shape=(0, -1, 4)))
            output['anchor'].append(
                fluid.layers.reshape(
                    anchor, shape=(-1, 4)))
            output['anchor_var'].append(
                fluid.layers.reshape(
                    anchor_var, shape=(-1, 4)))
        return output

    def get_prediction(self, body_feats, spatial_scale, im_info):
        """
        Get prediction bounding box in test stage.

        Args:
            body_feats(dict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor.
            im_info (Variable): A 2-D LoDTensor with shape [B, 3]. B is the
                number of input images, each element consists of im_height,
                im_width, im_scale.

        Returns:
            dict: `{'bbox': pred_result}`, where pred_result(Variable) is the
                output of `output_decoder`: prediction result with shape
                [N, 6]. Each row has 6 values:
                [label, confidence, xmin, ymin, xmax, ymax].
                N is the total number of prediction.
        """
        output = self._get_output(body_feats, spatial_scale)
        # The class subnet outputs logits; the decoder receives their sigmoid.
        score_list = [
            fluid.layers.sigmoid(cls_pred) for cls_pred in output['cls_pred']
        ]
        pred_result = self.output_decoder(
            bboxes=output['bbox_pred'],
            scores=score_list,
            anchors=output['anchor'],
            im_info=im_info)
        return {'bbox': pred_result}

    def get_loss(self, body_feats, spatial_scale, im_info, gt_box, gt_label,
                 is_crowd):
        """
        Calculate the loss of retinanet.

        Args:
            body_feats(dict): A dictionary represents the output of FPN with
                their name.
            spatial_scale(list): A list of multiplicative spatial scale factor.
            im_info(Variable): A 2-D LoDTensor with shape [B, 3]. B is the
                number of input images, each element consists of im_height,
                im_width, im_scale.
            gt_box(Variable): The ground-truth bounding boxes with shape [M, 4].
                M is the number of groundtruth.
            gt_label(Variable): The ground-truth labels with shape [M, 1].
                M is the number of groundtruth.
            is_crowd(Variable): Indicates ground-truth is crowd or not with
                shape [M, 1]. M is the number of groundtruth.

        Returns:
            Type: dict
                loss_cls(Variable): focal loss.
                loss_bbox(Variable): smooth l1 loss.
        """
        output = self._get_output(body_feats, spatial_scale)

        # Merge the per-level outputs: predictions along axis 1, anchors and
        # their variances along axis 0.
        cls_pred_input = fluid.layers.concat(output['cls_pred'], axis=1)
        bbox_pred_input = fluid.layers.concat(output['bbox_pred'], axis=1)
        anchor_input = fluid.layers.concat(output['anchor'], axis=0)
        anchor_var_input = fluid.layers.concat(output['anchor_var'], axis=0)

        score_pred, loc_pred, score_tgt, loc_tgt, bbox_weight, fg_num = \
            self.target_assign(
                bbox_pred=bbox_pred_input,
                cls_logits=cls_pred_input,
                anchor_box=anchor_input,
                anchor_var=anchor_var_input,
                gt_boxes=gt_box,
                gt_labels=gt_label,
                is_crowd=is_crowd,
                im_info=im_info,
                num_classes=self.num_classes - 1)
        fg_num = fluid.layers.reduce_sum(fg_num, name='fg_num')

        # Classification: sigmoid focal loss, summed over all samples.
        score_tgt = fluid.layers.cast(score_tgt, 'int32')
        loss_cls = fluid.layers.sigmoid_focal_loss(
            x=score_pred,
            label=score_tgt,
            fg_num=fg_num,
            gamma=self.gamma,
            alpha=self.alpha)
        loss_cls = fluid.layers.reduce_sum(loss_cls, name='loss_cls')

        # Regression: smooth l1 loss, summed and then divided by fg_num.
        loss_bbox = fluid.layers.smooth_l1(
            x=loc_pred,
            y=loc_tgt,
            sigma=self.sigma,
            inside_weight=bbox_weight,
            outside_weight=bbox_weight)
        loss_bbox = fluid.layers.reduce_sum(loss_bbox, name='loss_bbox')
        loss_bbox = loss_bbox / fg_num
        return {'loss_cls': loss_cls, 'loss_bbox': loss_bbox}