# cascade_head.py

# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from paddle.fluid.initializer import Normal, Xavier
from paddle.fluid.regularizer import L2Decay
from paddle.fluid.initializer import MSRA

from ppdet.modeling.ops import MultiClassNMS
from ppdet.modeling.ops import ConvNorm
from ppdet.core.workspace import register

# Names exported by a star-import of this module. Only `CascadeBBoxHead` is
# listed; the other classes defined in this file are not star-exported.
__all__ = ['CascadeBBoxHead']


@register
class CascadeBBoxHead(object):
    """
    Cascade RCNN bbox head

    Args:
        head (object): the head module instance, called as
            `head(roi_feat, wb_scalar, name)`
        nms (object): `MultiClassNMS` instance, or a dict of keyword
            arguments from which one is built
        num_classes (int): number of output classes
    """
    __inject__ = ['head', 'nms']
    __shared__ = ['num_classes']

    def __init__(self, head, nms=MultiClassNMS().__dict__, num_classes=81):
        super(CascadeBBoxHead, self).__init__()
        self.head = head
        self.nms = nms
        self.num_classes = num_classes
        # A dict (this includes the default value) is expanded into keyword
        # arguments for a new `MultiClassNMS`.
        if isinstance(nms, dict):
            self.nms = MultiClassNMS(**nms)

    def get_output(self,
                   roi_feat,
                   cls_agnostic_bbox_reg=2,
                   wb_scalar=1.0,
                   name=''):
        """
        Get bbox head output.

        Args:
            roi_feat (Variable): RoI feature from RoIExtractor.
            cls_agnostic_bbox_reg(Int): Number of box-delta groups predicted
                per RoI; the bbox fc layer has `4 * cls_agnostic_bbox_reg`
                outputs.
            wb_scalar(Float): Learning rate of the weights; the biases use
                `wb_scalar * 2`.
            name(String): Suffix inserted into the layer and parameter names.

        Returns:
            cls_score(Variable): cls score.
            bbox_pred(Variable): bbox regression.
        """
        head_feat = self.head(roi_feat, wb_scalar, name)
        cls_score = fluid.layers.fc(input=head_feat,
                                    size=self.num_classes,
                                    act=None,
                                    name='cls_score' + name,
                                    param_attr=ParamAttr(
                                        name='cls_score%s_w' % name,
                                        initializer=Normal(
                                            loc=0.0, scale=0.01),
                                        learning_rate=wb_scalar),
                                    bias_attr=ParamAttr(
                                        name='cls_score%s_b' % name,
                                        learning_rate=wb_scalar * 2,
                                        regularizer=L2Decay(0.)))
        bbox_pred = fluid.layers.fc(input=head_feat,
                                    size=4 * cls_agnostic_bbox_reg,
                                    act=None,
                                    name='bbox_pred' + name,
                                    param_attr=ParamAttr(
                                        name='bbox_pred%s_w' % name,
                                        initializer=Normal(
                                            loc=0.0, scale=0.001),
                                        learning_rate=wb_scalar),
                                    bias_attr=ParamAttr(
                                        name='bbox_pred%s_b' % name,
                                        learning_rate=wb_scalar * 2,
                                        regularizer=L2Decay(0.)))
        return cls_score, bbox_pred

    def get_loss(self, rcnn_pred_list, rcnn_target_list, rcnn_loss_weight_list):
        """
        Get bbox_head loss.

        Args:
            rcnn_pred_list(List): Cascade RCNN's head's output, one
                `(cls_score, bbox_pred)` pair per stage
            rcnn_target_list(List): Cascade rcnn's bbox and label target, one
                entry per stage. Index 1 is used as the labels, index 2 as the
                bbox regression target, and indices 3 / 4 as the inside /
                outside weights of the smooth-L1 loss.
            rcnn_loss_weight_list(List): The weight of location and class loss
                for each stage

        Return:
            loss_dict(dict): maps `'loss_cls_<i>'` and `'loss_loc_<i>'` to the
                weighted classification / bbox loss Variable of stage `i`.
        """
        loss_dict = {}
        for i, (rcnn_pred, rcnn_target
                ) in enumerate(zip(rcnn_pred_list, rcnn_target_list)):
            labels_int64 = fluid.layers.cast(x=rcnn_target[1], dtype='int64')
            labels_int64.stop_gradient = True

            loss_cls = fluid.layers.softmax_with_cross_entropy(
                logits=rcnn_pred[0],
                label=labels_int64,
                numeric_stable_mode=True, )
            loss_cls = fluid.layers.reduce_mean(
                loss_cls, name='loss_cls_' + str(i)) * rcnn_loss_weight_list[i]

            loss_bbox = fluid.layers.smooth_l1(
                x=rcnn_pred[1],
                y=rcnn_target[2],
                inside_weight=rcnn_target[3],
                outside_weight=rcnn_target[4],
                sigma=1.0,  # detectron use delta = 1./sigma**2
            )
            loss_bbox = fluid.layers.reduce_mean(
                loss_bbox,
                name='loss_bbox_' + str(i)) * rcnn_loss_weight_list[i]

            loss_dict['loss_cls_%d' % i] = loss_cls
            loss_dict['loss_loc_%d' % i] = loss_bbox

        return loss_dict

    def get_prediction(self,
                       im_info,
                       im_shape,
                       roi_feat_list,
                       rcnn_pred_list,
                       proposal_list,
                       cascade_bbox_reg_weights,
                       cls_agnostic_bbox_reg=2,
                       return_box_score=False):
        """
        Get prediction bounding box in test stage.

        Args:
            im_info (Variable): A 2-D LoDTensor with shape [B, 3]. B is the
                number of input images, each element consists
                of im_height, im_width, im_scale.
            im_shape (Variable): Actual shape of original image with shape
                [B, 3]. B is the number of images, each element consists of
                original_height, original_width, 1
            roi_feat_list (List): RoI feature from RoIExtractor; only the
                last entry is used.
            rcnn_pred_list (List): Cascade rcnn's head's output
                including bbox_pred and cls_score; only the last entry is
                used.
            proposal_list (List): RPN proposal boxes; only the last entry is
                used.
            cascade_bbox_reg_weights (List): BBox decode var; only the last
                entry is used.
            cls_agnostic_bbox_reg(Int): Number of box-delta groups per RoI in
                bbox_pred. When it is 2, only the second group is used to
                decode the boxes.
            return_box_score(Bool): If True, skip NMS and return the clipped
                boxes together with the averaged class probabilities.

        Returns:
            dict: `{'bbox': boxes, 'score': scores}` when `return_box_score`
                is True, otherwise `{'bbox': pred_result}` where `pred_result`
                is the output of `self.nms`.
        """
        self.im_scale = fluid.layers.slice(im_info, [1], starts=[2], ends=[3])
        boxes_cls_prob_l = []

        rcnn_pred = rcnn_pred_list[-1]  # stage 3
        bbox_reg_w = cascade_bbox_reg_weights[-1]
        num_stages = 3
        for i in range(num_stages):
            # cls score
            if i < num_stages - 1:
                # Run the head of an earlier stage on the last-stage RoI
                # features. The bbox branch created here is discarded, but it
                # must be built with the same size as the caller's heads, so
                # `cls_agnostic_bbox_reg` is forwarded.
                cls_score, _ = self.get_output(
                    roi_feat_list[-1],  # roi_feat_3
                    cls_agnostic_bbox_reg=cls_agnostic_bbox_reg,
                    name='_' + str(i + 1) if i > 0 else '')
            else:
                cls_score = rcnn_pred[0]
            cls_prob = fluid.layers.softmax(cls_score, use_cudnn=False)
            boxes_cls_prob_l.append(cls_prob)

        # Average the class probabilities over all collected stages.
        boxes_cls_prob_sum = boxes_cls_prob_l[0]
        for cls_prob in boxes_cls_prob_l[1:]:
            boxes_cls_prob_sum = boxes_cls_prob_sum + cls_prob
        boxes_cls_prob_mean = boxes_cls_prob_sum / float(len(boxes_cls_prob_l))

        # bbox pred
        proposals_boxes = proposal_list[-1]
        im_scale_lod = fluid.layers.sequence_expand(self.im_scale,
                                                    proposals_boxes)
        # Divide the proposals by the per-image scale taken from im_info.
        proposals_boxes = proposals_boxes / im_scale_lod
        bbox_pred = rcnn_pred[1]
        bbox_pred_new = fluid.layers.reshape(bbox_pred,
                                             (-1, cls_agnostic_bbox_reg, 4))
        if cls_agnostic_bbox_reg == 2:
            # only use fg box delta to decode box
            bbox_pred_new = fluid.layers.slice(
                bbox_pred_new, axes=[1], starts=[1], ends=[2])
            bbox_pred_new = fluid.layers.expand(bbox_pred_new,
                                                [1, self.num_classes, 1])
        decoded_box = fluid.layers.box_coder(
            prior_box=proposals_boxes,
            prior_box_var=bbox_reg_w,
            target_box=bbox_pred_new,
            code_type='decode_center_size',
            box_normalized=False,
            axis=1)

        box_out = fluid.layers.box_clip(input=decoded_box, im_info=im_shape)
        if return_box_score:
            return {'bbox': box_out, 'score': boxes_cls_prob_mean}
        pred_result = self.nms(bboxes=box_out, scores=boxes_cls_prob_mean)
        return {"bbox": pred_result}

    def get_prediction_cls_aware(self, im_info, im_shape, cascade_cls_prob,
                                 cascade_decoded_box, cascade_bbox_reg_weights):
        '''
        get_prediction_cls_aware: predict bbox for each class

        Args:
            im_info (Variable): image info; the slice [2:3] along axis 1 is
                used as the image scale.
            im_shape (Variable): passed to `box_clip` as `im_info`.
            cascade_cls_prob (List): class probabilities of each stage.
            cascade_decoded_box (List): decoded boxes of each stage.
            cascade_bbox_reg_weights (List): not used by this method.

        Returns:
            dict: `{'bbox': pred_result}` where `pred_result` is the output
                of `self.nms`.
        '''
        # Per-stage weights for merging; at most three stages are supported.
        cascade_eval_weight = [0.2, 0.3, 0.5]
        # merge 3 stages results
        sum_cascade_cls_prob = sum([
            prob * cascade_eval_weight[idx]
            for idx, prob in enumerate(cascade_cls_prob)
        ])
        sum_cascade_decoded_box = sum([
            bbox * cascade_eval_weight[idx]
            for idx, bbox in enumerate(cascade_decoded_box)
        ])
        self.im_scale = fluid.layers.slice(im_info, [1], starts=[2], ends=[3])
        im_scale_lod = fluid.layers.sequence_expand(self.im_scale,
                                                    sum_cascade_decoded_box)

        # Divide the merged boxes by the per-image scale taken from im_info.
        sum_cascade_decoded_box = sum_cascade_decoded_box / im_scale_lod

        decoded_bbox = sum_cascade_decoded_box
        decoded_bbox = fluid.layers.reshape(
            decoded_bbox, shape=(-1, self.num_classes, 4))

        box_out = fluid.layers.box_clip(input=decoded_bbox, im_info=im_shape)
        pred_result = self.nms(bboxes=box_out, scores=sum_cascade_cls_prob)
        return {"bbox": pred_result}


@register
class CascadeXConvNormHead(object):
    """
    RCNN head built from several `ConvNorm` layers followed by one fc layer

    Args:
        num_conv (int): num of convolution layers for the rcnn head
        conv_dim (int): num of filters for the conv layers
        mlp_dim (int): num of filters for the fc layers
        norm_type: forwarded to `ConvNorm` as `norm_type`
        freeze_norm (bool): forwarded to `ConvNorm` as `freeze_norm`
    """
    __shared__ = ['norm_type', 'freeze_norm']

    def __init__(self,
                 num_conv=4,
                 conv_dim=256,
                 mlp_dim=1024,
                 norm_type=None,
                 freeze_norm=False):
        super(CascadeXConvNormHead, self).__init__()
        self.num_conv = num_conv
        self.conv_dim = conv_dim
        self.mlp_dim = mlp_dim
        self.norm_type = norm_type
        self.freeze_norm = freeze_norm

    def __call__(self, roi_feat, wb_scalar=1.0, name=''):
        """
        Apply the conv stack and the fc layer to `roi_feat`.

        Args:
            roi_feat (Variable): input RoI feature.
            wb_scalar (float): learning rate of the weights; the fc bias
                uses `wb_scalar * 2`.
            name (str): only used in the fc layer names when `num_conv`
                is 0 (see the note in the body).

        Returns:
            Variable: output of the fc layer.
        """
        conv_init = MSRA(uniform=False, fan_in=self.conv_dim * 3 * 3)

        # NOTE(review): each iteration rebinds the layer name, so whenever
        # num_conv > 0 the `name` argument does not appear in any layer or
        # parameter name and the fc layer is named after the last conv.
        # Changing this would rename parameters; confirm before touching.
        layer_name = name
        feat = roi_feat
        for idx in range(self.num_conv):
            layer_name = 'bbox_head_conv' + str(idx)
            feat = ConvNorm(
                feat,
                self.conv_dim,
                3,
                act='relu',
                initializer=conv_init,
                norm_type=self.norm_type,
                freeze_norm=self.freeze_norm,
                lr_scale=wb_scalar,
                name=layer_name,
                norm_name=layer_name)

        fc_fan = feat.shape[1] * feat.shape[2] * feat.shape[3]
        weight_attr = ParamAttr(
            name='fc6%s_w' % layer_name,
            initializer=Xavier(fan_out=fc_fan),
            learning_rate=wb_scalar)
        bias_attr = ParamAttr(
            name='fc6%s_b' % layer_name,
            regularizer=L2Decay(0.),
            learning_rate=wb_scalar * 2)
        return fluid.layers.fc(input=feat,
                               size=self.mlp_dim,
                               act='relu',
                               name='fc6' + layer_name,
                               param_attr=weight_attr,
                               bias_attr=bias_attr)


@register
class CascadeTwoFCHead(object):
    """
    RCNN head with two fully connected layers (fc6 and fc7)

    Args:
        mlp_dim (int): num of filters for the fc layers
    """

    def __init__(self, mlp_dim):
        super(CascadeTwoFCHead, self).__init__()
        self.mlp_dim = mlp_dim

    def __call__(self, roi_feat, wb_scalar=1.0, name=''):
        """
        Apply fc6 and fc7 (both with relu) to `roi_feat`.

        Args:
            roi_feat (Variable): input RoI feature; dims 1, 2 and 3 of its
                shape are multiplied to get the fan value for fc6.
            wb_scalar (float): learning rate of the weights; the biases use
                `wb_scalar * 2`.
            name (str): suffix inserted into the layer and parameter names.

        Returns:
            Variable: output of fc7.
        """
        in_fan = roi_feat.shape[1] * roi_feat.shape[2] * roi_feat.shape[3]
        hidden = self._relu_fc(roi_feat, 'fc6', name,
                               Xavier(fan_out=in_fan), wb_scalar)
        return self._relu_fc(hidden, 'fc7', name, Xavier(), wb_scalar)

    def _relu_fc(self, x, prefix, name, initializer, wb_scalar):
        # One relu fc layer of width mlp_dim, with parameters named
        # `<prefix><name>_w` / `<prefix><name>_b`.
        weight_attr = ParamAttr(
            name=prefix + name + '_w',
            initializer=initializer,
            learning_rate=wb_scalar)
        bias_attr = ParamAttr(
            name=prefix + name + '_b',
            learning_rate=wb_scalar * 2,
            regularizer=L2Decay(0.))
        return fluid.layers.fc(input=x,
                               size=self.mlp_dim,
                               act='relu',
                               name=prefix + name,
                               param_attr=weight_attr,
                               bias_attr=bias_attr)