post_process.py 28.9 KB
Newer Older
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

Q
qingqing01 已提交
15 16 17 18 19
import numpy as np
import paddle
import paddle.nn as nn
import paddle.nn.functional as F
from ppdet.core.workspace import register
W
wangxinxin08 已提交
20
from ppdet.modeling.bbox_utils import nonempty_bbox
21
from .transformers import bbox_cxcywh_to_xyxy
W
wangguanzhong 已提交
22 23 24 25
try:
    from collections.abc import Sequence
except Exception:
    from collections import Sequence
Q
qingqing01 已提交
26

27
# Names exported by a star-import of this module.
__all__ = [
    'BBoxPostProcess', 'MaskPostProcess', 'JDEBBoxPostProcess',
    'CenterNetPostProcess', 'DETRPostProcess', 'SparsePostProcess'
]
F
Feng Ni 已提交
31

Q
qingqing01 已提交
32 33

@register
G
Guanghua Yu 已提交
34
class BBoxPostProcess(object):
    """
    Decode bbox head outputs, optionally apply NMS, and rescale the kept
    boxes to the original image.

    Args:
        num_classes (int): number of classes passed to ``nms``.
        decode (callable): decoder called as
            ``decode(head_out, rois, im_shape, scale_factor)``.
        nms (callable|None): NMS operator called as
            ``nms(bboxes, score, num_classes)``. When None, the output of
            ``decode`` is returned directly by ``__call__``.
        export_onnx (bool): whether the model is being exported to onnx.
        export_eb (bool): when True, ``get_pred`` skips rescaling, clipping
            and filtering (rcnn models for edgeboard hw).
    """
    __shared__ = ['num_classes', 'export_onnx', 'export_eb']
    __inject__ = ['decode', 'nms']

    def __init__(self,
                 num_classes=80,
                 decode=None,
                 nms=None,
                 export_onnx=False,
                 export_eb=False):
        super(BBoxPostProcess, self).__init__()
        self.num_classes = num_classes
        self.decode = decode
        self.nms = nms
        self.export_onnx = export_onnx
        self.export_eb = export_eb

    @staticmethod
    def _fake_bbox():
        """Build the [1, 6] placeholder row: label 0, score 0, box [0, 0, 1, 1]."""
        return paddle.to_tensor(
            np.array(
                [[0., 0.0, 0.0, 0.0, 1.0, 1.0]], dtype='float32'))

    def __call__(self, head_out, rois, im_shape, scale_factor):
        """
        Decode the bbox and do NMS if needed.

        Args:
            head_out (tuple): bbox_pred and cls_prob of bbox_head output.
            rois (tuple): roi and rois_num of rpn_head output.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [1], and is N.
            before_nms_indexes: the third value returned by ``self.nms``;
                only returned when ``self.nms`` is not None.
        """
        use_nms = self.nms is not None
        if use_nms:
            bboxes, score = self.decode(head_out, rois, im_shape, scale_factor)
            bbox_pred, bbox_num, before_nms_indexes = self.nms(
                bboxes, score, self.num_classes)
        else:
            bbox_pred, bbox_num = self.decode(head_out, rois, im_shape,
                                              scale_factor)

        if self.export_onnx:
            # add fake box after postprocess when exporting onnx
            bbox_pred = paddle.concat([bbox_pred, self._fake_bbox()])
            bbox_num = bbox_num + 1

        if use_nms:
            return bbox_pred, bbox_num, before_nms_indexes
        return bbox_pred, bbox_num

    def get_pred(self, bboxes, bbox_num, im_shape, scale_factor):
        """
        Rescale, clip and filter the bbox from the output of NMS to
        get final prediction.

        Notes:
        Currently only support bs = 1.

        As a side effect the per-box original image shape is stored in
        ``self.origin_shape_list`` (see ``get_origin_shape``).

        Args:
            bboxes (Tensor): The output bboxes with shape [N, 6] after decode
                and NMS, including labels, scores and bboxes.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [1], and is N.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image.
        Returns:
            bboxes (Tensor): The boxes before rescaling; images without any
                box contribute one fake box (not when exporting onnx).
            pred_result (Tensor): The final prediction results with shape [N, 6]
                including labels, scores and bboxes. Empty boxes get label -1.
                When ``export_eb`` is True this is ``bboxes`` unchanged.
            bbox_num (Tensor): The number of boxes of each image.
        """
        if self.export_eb:
            # enable rcnn models for edgeboard hw to skip the following postprocess.
            return bboxes, bboxes, bbox_num

        if not self.export_onnx:
            bboxes_list = []
            bbox_num_list = []
            id_start = 0
            fake_bboxes = self._fake_bbox()
            fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))

            # add fake bbox when output is empty for each batch
            for i in range(bbox_num.shape[0]):
                if bbox_num[i] == 0:
                    bboxes_i = fake_bboxes
                    bbox_num_i = fake_bbox_num
                else:
                    bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]
                    # Slice instead of integer-index so the count keeps
                    # rank 1: paddle.concat cannot join 0-D tensors.
                    bbox_num_i = bbox_num[i:i + 1]
                    id_start += bbox_num[i:i + 1]
                bboxes_list.append(bboxes_i)
                bbox_num_list.append(bbox_num_i)
            bboxes = paddle.concat(bboxes_list)
            bbox_num = paddle.concat(bbox_num_list)

        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)

        if not self.export_onnx:
            origin_shape_list = []
            scale_factor_list = []
            # scale_factor: scale_y, scale_x
            for i in range(bbox_num.shape[0]):
                num_i = bbox_num[i:i + 1]
                expand_shape = paddle.expand(origin_shape[i:i + 1, :],
                                             [num_i, 2])
                # rank-1 slices so the four scales can be concatenated
                scale_y, scale_x = scale_factor[i, 0:1], scale_factor[i, 1:2]
                scale = paddle.concat([scale_x, scale_y, scale_x, scale_y])
                expand_scale = paddle.expand(scale, [num_i, 4])
                origin_shape_list.append(expand_shape)
                scale_factor_list.append(expand_scale)

            self.origin_shape_list = paddle.concat(origin_shape_list)
            scale_factor_list = paddle.concat(scale_factor_list)

        else:
            # simplify the computation for bs=1 when exporting onnx
            scale_y, scale_x = scale_factor[0][0], scale_factor[0][1]
            scale = paddle.concat(
                [scale_x, scale_y, scale_x, scale_y]).unsqueeze(0)
            self.origin_shape_list = paddle.expand(origin_shape,
                                                   [bbox_num[0], 2])
            scale_factor_list = paddle.expand(scale, [bbox_num[0], 4])

        # bboxes: [N, 6], label, score, bbox
        pred_label = bboxes[:, 0:1]
        pred_score = bboxes[:, 1:2]
        pred_bbox = bboxes[:, 2:]
        # rescale bbox to original image
        scaled_bbox = pred_bbox / scale_factor_list
        origin_h = self.origin_shape_list[:, 0]
        origin_w = self.origin_shape_list[:, 1]
        zeros = paddle.zeros_like(origin_h)
        # clip bbox to [0, original_size]
        x1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 0], origin_w), zeros)
        y1 = paddle.maximum(paddle.minimum(scaled_bbox[:, 1], origin_h), zeros)
        x2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 2], origin_w), zeros)
        y2 = paddle.maximum(paddle.minimum(scaled_bbox[:, 3], origin_h), zeros)
        pred_bbox = paddle.stack([x1, y1, x2, y2], axis=-1)
        # filter empty bbox: their label is overwritten with -1
        keep_mask = nonempty_bbox(pred_bbox, return_mask=True)
        keep_mask = paddle.unsqueeze(keep_mask, [1])
        pred_label = paddle.where(keep_mask, pred_label,
                                  paddle.ones_like(pred_label) * -1)
        pred_result = paddle.concat([pred_label, pred_score, pred_bbox], axis=1)
        return bboxes, pred_result, bbox_num

    def get_origin_shape(self):
        """Return the per-box original image shape stored by ``get_pred``."""
        return self.origin_shape_list

Q
qingqing01 已提交
190 191 192

@register
class MaskPostProcess(object):
    """
    Get Mask output according to the output from model

    refer to:
    https://github.com/facebookresearch/detectron2/layers/mask_ops.py
    """
    __shared__ = ['export_onnx', 'assign_on_cpu']

    def __init__(self,
                 binary_thresh=0.5,
                 export_onnx=False,
                 assign_on_cpu=False):
        super(MaskPostProcess, self).__init__()
        self.binary_thresh = binary_thresh
        self.export_onnx = export_onnx
        self.assign_on_cpu = assign_on_cpu

    def __call__(self, mask_out, bboxes, bbox_num, origin_shape):
        """
        Decode the mask_out and paste the mask to the origin image.

        Args:
            mask_out (Tensor): mask_head output with shape [N, 28, 28].
            bboxes (Tensor): The output bboxes with shape [N, 6] after decode
                and NMS, including labels, scores and bboxes.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [1], and is N.
            origin_shape (Tensor): The origin shape of the input image, the tensor
                shape is [N, 2], and each row is [h, w].
        Returns:
            pred_result (Tensor): The final prediction mask results with shape
                [N, h, w] in binary mask style.
        """
        num_mask = mask_out.shape[0]
        origin_shape = paddle.cast(origin_shape, 'int32')
        # Remember the active device: paste_mask switches to cpu when
        # assign_on_cpu is set, and it is restored before returning.
        device = paddle.device.get_device()

        if self.export_onnx:
            # only the first image's shape is used in this branch
            h, w = origin_shape[0][0], origin_shape[0][1]
            mask_onnx = paste_mask(mask_out[:, None, :, :], bboxes[:, 2:], h, w,
                                   self.assign_on_cpu)
            mask_onnx = mask_onnx >= self.binary_thresh
            pred_result = paddle.cast(mask_onnx, 'int32')

        else:
            # canvas large enough for every image, pre-filled with -1
            max_h = paddle.max(origin_shape[:, 0])
            max_w = paddle.max(origin_shape[:, 1])
            pred_result = paddle.zeros(
                [num_mask, max_h, max_w], dtype='int32') - 1

            id_start = 0
            for i in range(paddle.shape(bbox_num)[0]):
                bboxes_i = bboxes[id_start:id_start + bbox_num[i], :]
                mask_out_i = mask_out[id_start:id_start + bbox_num[i], :, :]
                im_h = origin_shape[i, 0]
                im_w = origin_shape[i, 1]
                pred_mask = paste_mask(mask_out_i[:, None, :, :],
                                       bboxes_i[:, 2:], im_h, im_w,
                                       self.assign_on_cpu)
                pred_mask = paddle.cast(pred_mask >= self.binary_thresh,
                                        'int32')
                # write this image's masks into the top-left of the canvas
                pred_result[id_start:id_start + bbox_num[i], :im_h, :
                            im_w] = pred_mask
                id_start += bbox_num[i]

        if self.assign_on_cpu:
            paddle.set_device(device)

        return pred_result
F
Feng Ni 已提交
261 262


263
@register
264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
class JDEBBoxPostProcess(nn.Layer):
    """
    Decode the JDE head output and run NMS, substituting fixed placeholder
    tensors when decoding or NMS yields nothing.

    Args:
        num_classes (int): number of classes passed to ``nms``.
        decode (callable): decoder called as ``decode(head_out, anchors)``.
        nms (callable): NMS operator called as
            ``nms(boxes, scores, num_classes)``, returning three values.
        return_idx (bool): selects which index tensors ``forward`` returns.
    """
    __shared__ = ['num_classes']
    __inject__ = ['decode', 'nms']

    def __init__(self, num_classes=1, decode=None, nms=None, return_idx=True):
        super(JDEBBoxPostProcess, self).__init__()
        self.num_classes = num_classes
        self.decode = decode
        self.nms = nms
        self.return_idx = return_idx

        # placeholders used when NMS keeps no box
        self.fake_bbox_pred = paddle.to_tensor(
            np.array(
                [[-1, 0.0, 0.0, 0.0, 0.0, 0.0]], dtype='float32'))
        self.fake_bbox_num = paddle.to_tensor(np.array([1], dtype='int32'))
        self.fake_nms_keep_idx = paddle.to_tensor(
            np.array(
                [[0]], dtype='int32'))

        # placeholders used when the decoder keeps no box
        self.fake_yolo_boxes_out = paddle.to_tensor(
            np.array(
                [[[0.0, 0.0, 0.0, 0.0]]], dtype='float32'))
        self.fake_yolo_scores_out = paddle.to_tensor(
            np.array(
                [[[0.0]]], dtype='float32'))
        self.fake_boxes_idx = paddle.to_tensor(np.array([[0]], dtype='int64'))

    def forward(self, head_out, anchors):
        """
        Decode the bbox and do NMS for JDE model.

        Args:
            head_out (list): Bbox_pred and cls_prob of bbox_head output.
            anchors (list): Anchors of JDE model.

        Returns:
            boxes_idx (Tensor): The index of kept bboxes after decode 'JDEBox'.
            bbox_pred (Tensor): The output is the prediction with shape [N, 6]
                including labels, scores and bboxes.
            bbox_num (Tensor): The number of prediction of each batch with shape [N].
            nms_keep_idx (Tensor): The index of kept bboxes after NMS.

            When ``return_idx`` is False, the first and last items are both
            the index tensor returned by ``self.nms``, without the
            empty-result placeholder applied.
        """
        boxes_idx, yolo_boxes_scores = self.decode(head_out, anchors)

        if len(boxes_idx) == 0:
            boxes_idx = self.fake_boxes_idx
            yolo_boxes_out = self.fake_yolo_boxes_out
            yolo_scores_out = self.fake_yolo_scores_out
        else:
            yolo_boxes = paddle.gather_nd(yolo_boxes_scores, boxes_idx)
            # TODO: only support bs=1 now
            yolo_boxes_out = paddle.reshape(
                yolo_boxes[:, :4], shape=[1, len(boxes_idx), 4])
            yolo_scores_out = paddle.reshape(
                yolo_boxes[:, 4:5], shape=[1, 1, len(boxes_idx)])
            boxes_idx = boxes_idx[:, 1:]

        bbox_pred, bbox_num, nms_keep_idx = self.nms(
            yolo_boxes_out, yolo_scores_out, self.num_classes)

        nothing_kept = bbox_pred.shape[0] == 0
        if nothing_kept:
            bbox_pred = self.fake_bbox_pred
            bbox_num = self.fake_bbox_num

        if self.return_idx:
            if nothing_kept:
                nms_keep_idx = self.fake_nms_keep_idx
            return boxes_idx, bbox_pred, bbox_num, nms_keep_idx

        # Without return_idx the raw NMS index fills both index slots.
        return nms_keep_idx, bbox_pred, bbox_num, nms_keep_idx
F
FlyingQianMM 已提交
336 337 338


@register
339
class CenterNetPostProcess(object):
    """
    Postprocess the model outputs to get final prediction:
        1. Do NMS for heatmap to get top `max_per_img` bboxes.
        2. Decode bboxes using center offset and box size.
        3. Rescale decoded bboxes reference to the origin image shape.
    Args:
        max_per_img(int): the maximum number of predicted objects in a image,
            500 by default.
        down_ratio(int): the down ratio from images to heatmap, 4 by default.
        regress_ltrb (bool): whether to regress left/top/right/bottom or
            width/height for a box, true by default.
    """
    __shared__ = ['down_ratio']

    def __init__(self, max_per_img=500, down_ratio=4, regress_ltrb=True):
        super(CenterNetPostProcess, self).__init__()
        self.max_per_img = max_per_img
        self.down_ratio = down_ratio
        self.regress_ltrb = regress_ltrb

    # _simple_nms() _topk() are same as TTFBox in ppdet/modeling/layers.py

    def _simple_nms(self, heat, kernel=3):
        """ Use maxpool to filter the max score, get local peaks. """
        pad = (kernel - 1) // 2
        hmax = F.max_pool2d(heat, kernel, stride=1, padding=pad)
        # a position survives only if it equals the max of its window
        keep = paddle.cast(hmax == heat, 'float32')
        return heat * keep

    def _topk(self, scores):
        """ Select top k scores and decode to get xy coordinates. """
        k = self.max_per_img
        shape_fm = paddle.shape(scores)
        shape_fm.stop_gradient = True
        cat, height, width = shape_fm[1], shape_fm[2], shape_fm[3]
        # batch size is 1
        scores_r = paddle.reshape(scores, [cat, -1])
        # first pass: top k positions inside every class map
        topk_scores, topk_inds = paddle.topk(scores_r, k)
        topk_ys = topk_inds // width
        topk_xs = topk_inds % width

        # second pass: top k among all per-class candidates
        topk_score_r = paddle.reshape(topk_scores, [-1])
        topk_score, topk_ind = paddle.topk(topk_score_r, k)
        k_t = paddle.full(paddle.shape(topk_ind), k, dtype='int64')
        # candidates are laid out class-major, k per class
        topk_clses = paddle.cast(paddle.floor_divide(topk_ind, k_t), 'float32')

        topk_inds = paddle.reshape(topk_inds, [-1])
        topk_ys = paddle.reshape(topk_ys, [-1, 1])
        topk_xs = paddle.reshape(topk_xs, [-1, 1])
        topk_inds = paddle.gather(topk_inds, topk_ind)
        topk_ys = paddle.gather(topk_ys, topk_ind)
        topk_xs = paddle.gather(topk_xs, topk_ind)
        return topk_score, topk_inds, topk_clses, topk_ys, topk_xs

    def __call__(self, hm, wh, reg, im_shape, scale_factor):
        """
        Turn heatmap, size and offset maps into boxes in the original image.

        Only batch_size=1 is supported.

        Args:
            hm (Tensor): class heatmap, already passed through sigmoid.
            wh (Tensor): box size map; read as left/top/right/bottom
                distances when ``regress_ltrb`` is True, else as width/height.
            reg (Tensor): center offset map.
            im_shape (Tensor): The shape of the input image.
            scale_factor (Tensor): The scale factor of the input image,
                columns are scale_y, scale_x.
        Returns:
            results (Tensor): rows of [class, score, x1, y1, x2, y2].
            num (Tensor): first element of the shape of ``results``.
            inds (Tensor): flat heatmap indices of the selected peaks.
            topk_clses (Tensor): class ids of the selected peaks.
            ys, xs (Tensor): peak coordinates on the heatmap with the
                regressed offsets added.
        """
        # 1.get clses and scores, note that hm had been done sigmoid
        heat = self._simple_nms(hm)
        scores, inds, topk_clses, ys, xs = self._topk(heat)
        clses = topk_clses.unsqueeze(1)
        scores = scores.unsqueeze(1)

        # 2.get bboxes, note only support batch_size=1 now
        reg_t = paddle.transpose(reg, [0, 2, 3, 1])
        reg = paddle.reshape(reg_t, [-1, reg_t.shape[-1]])
        reg = paddle.gather(reg, inds)
        xs = paddle.cast(xs, 'float32')
        ys = paddle.cast(ys, 'float32')
        xs = xs + reg[:, 0:1]
        ys = ys + reg[:, 1:2]
        wh_t = paddle.transpose(wh, [0, 2, 3, 1])
        wh = paddle.reshape(wh_t, [-1, wh_t.shape[-1]])
        wh = paddle.gather(wh, inds)
        if self.regress_ltrb:
            x1 = xs - wh[:, 0:1]
            y1 = ys - wh[:, 1:2]
            x2 = xs + wh[:, 2:3]
            y2 = ys + wh[:, 3:4]
        else:
            x1 = xs - wh[:, 0:1] / 2
            y1 = ys - wh[:, 1:2] / 2
            x2 = xs + wh[:, 0:1] / 2
            y2 = ys + wh[:, 1:2] / 2

        # only the spatial size of the heatmap is needed
        hm_shape = paddle.shape(hm)
        feat_h, feat_w = hm_shape[2], hm_shape[3]
        # half the difference between the upscaled heatmap and im_shape
        padw = (feat_w * self.down_ratio - im_shape[0, 1]) / 2
        padh = (feat_h * self.down_ratio - im_shape[0, 0]) / 2
        x1 = x1 * self.down_ratio
        y1 = y1 * self.down_ratio
        x2 = x2 * self.down_ratio
        y2 = y2 * self.down_ratio
        x1 = x1 - padw
        y1 = y1 - padh
        x2 = x2 - padw
        y2 = y2 - padh
        bboxes = paddle.concat([x1, y1, x2, y2], axis=1)

        # 3.rescale bboxes to the original image
        scale_y = scale_factor[:, 0:1]
        scale_x = scale_factor[:, 1:2]
        scale_expand = paddle.concat(
            [scale_x, scale_y, scale_x, scale_y], axis=1)
        boxes_shape = bboxes.shape[:]
        scale_expand = paddle.expand(scale_expand, shape=boxes_shape)
        bboxes = paddle.divide(bboxes, scale_expand)

        results = paddle.concat([clses, scores, bboxes], axis=1)
        return results, paddle.shape(results)[0:1], inds, topk_clses, ys, xs
443 444 445


@register
446 447
class DETRPostProcess(object):
    """
    Convert DETR-style head outputs into [label, score, box] predictions,
    optionally with binary masks.

    Args:
        num_classes (int): number of classes, used to split flattened
            top-k indices into query index and label under focal loss.
        num_top_queries (int): number of predictions kept per image.
        dual_queries (bool): when True only the first
            ``num_queries // (dual_groups + 1)`` queries are used.
        dual_groups (int): number of extra query groups, see above.
        use_focal_loss (bool): score with sigmoid over all classes instead
            of softmax with the last class dropped.
        with_mask (bool): whether to also produce masks.
        mask_threshold (float): threshold applied to mask probabilities.
        use_avg_mask_score (bool): scale scores by the mean mask
            probability inside the thresholded mask.
    """
    __shared__ = ['num_classes', 'use_focal_loss', 'with_mask']
    __inject__ = []

    def __init__(self,
                 num_classes=80,
                 num_top_queries=100,
                 dual_queries=False,
                 dual_groups=0,
                 use_focal_loss=False,
                 with_mask=False,
                 mask_threshold=0.5,
                 use_avg_mask_score=False):
        super(DETRPostProcess, self).__init__()
        self.num_classes = num_classes
        self.num_top_queries = num_top_queries
        self.dual_queries = dual_queries
        self.dual_groups = dual_groups
        self.use_focal_loss = use_focal_loss
        self.with_mask = with_mask
        self.mask_threshold = mask_threshold
        self.use_avg_mask_score = use_avg_mask_score

    def _mask_postprocess(self, mask_pred, score_pred, index):
        """
        Pick the masks addressed by ``index``, binarize them and optionally
        rescale the scores.

        Returns the binary masks of the first image as int32 and the
        (possibly rescaled) scores.
        """
        mask_score = F.sigmoid(paddle.gather_nd(mask_pred, index))
        mask_pred = (mask_score > self.mask_threshold).astype(mask_score.dtype)
        if self.use_avg_mask_score:
            avg_mask_score = (mask_pred * mask_score).sum([-2, -1]) / (
                mask_pred.sum([-2, -1]) + 1e-6)
            # out-of-place so the caller's tensor is not modified
            score_pred = score_pred * avg_mask_score

        return mask_pred[0].astype('int32'), score_pred

    def __call__(self, head_out, im_shape, scale_factor, pad_shape):
        """
        Decode the bbox.

        Args:
            head_out (tuple): bbox_pred, cls_logit and masks of bbox_head output.
            im_shape (Tensor): The shape of the input image without padding.
            scale_factor (Tensor): The scale factor of the input image.
            pad_shape (Tensor): The shape of the input image with padding.
        Returns:
            bbox_pred (Tensor): The output prediction with shape [N, 6], including
                labels, scores and bboxes. The size of bboxes are corresponding
                to the input image, the bboxes may be used in other branch.
            bbox_num (Tensor): The number of prediction boxes of each batch with
                shape [bs], and is N.
            mask_pred (Tensor|None): binary masks when ``with_mask`` is True,
                otherwise None.
        """
        bboxes, logits, masks = head_out
        if self.dual_queries:
            num_queries = logits.shape[1]
            num_used = int(num_queries // (self.dual_groups + 1))
            logits = logits[:, :num_used, :]
            bboxes = bboxes[:, :num_used, :]

        bbox_pred = bbox_cxcywh_to_xyxy(bboxes)
        # calculate the original shape of the image
        origin_shape = paddle.floor(im_shape / scale_factor + 0.5)
        img_h, img_w = paddle.split(origin_shape, 2, axis=-1)
        # calculate the shape of the image with padding
        out_shape = pad_shape / im_shape * origin_shape
        # [h, w] -> [w, h, w, h], broadcast over the query axis
        out_shape = out_shape.flip(1).tile([1, 2]).unsqueeze(1)
        bbox_pred *= out_shape

        # number of predictions kept per image; fewer than num_top_queries
        # only when softmax scoring sees fewer queries than that
        num_kept = self.num_top_queries
        if self.use_focal_loss:
            scores = F.sigmoid(logits)
            # top-k over every (query, class) pair
            scores, index = paddle.topk(
                scores.flatten(1), self.num_top_queries, axis=-1)
            labels = index % self.num_classes
            index = index // self.num_classes
            batch_ind = paddle.arange(end=scores.shape[0]).unsqueeze(-1).tile(
                [1, self.num_top_queries])
            index = paddle.stack([batch_ind, index], axis=-1)
            bbox_pred = paddle.gather_nd(bbox_pred, index)
        else:
            # the last softmax channel is dropped before scoring
            scores = F.softmax(logits)[:, :, :-1]
            scores, labels = scores.max(-1), scores.argmax(-1)
            if scores.shape[1] > self.num_top_queries:
                scores, index = paddle.topk(
                    scores, self.num_top_queries, axis=-1)
                batch_ind = paddle.arange(
                    end=scores.shape[0]).unsqueeze(-1).tile(
                        [1, self.num_top_queries])
                index = paddle.stack([batch_ind, index], axis=-1)
                labels = paddle.gather_nd(labels, index)
                bbox_pred = paddle.gather_nd(bbox_pred, index)
            else:
                # Every query is kept in its original order. Build the
                # identity index so mask gathering below still works, and
                # report the real number of kept queries.
                num_kept = scores.shape[1]
                batch_ind = paddle.arange(
                    end=scores.shape[0]).unsqueeze(-1).tile([1, num_kept])
                query_ind = paddle.arange(end=num_kept).unsqueeze(0).tile(
                    [scores.shape[0], 1])
                index = paddle.stack([batch_ind, query_ind], axis=-1)

        mask_pred = None
        if self.with_mask:
            assert masks is not None
            masks = F.interpolate(
                masks, scale_factor=4, mode="bilinear", align_corners=False)
            # TODO: Support prediction with bs>1.
            # remove padding for input image
            h, w = im_shape.astype('int32')[0]
            masks = masks[..., :h, :w]
            # get pred_mask in the original resolution.
            img_h = img_h[0].astype('int32')
            img_w = img_w[0].astype('int32')
            masks = F.interpolate(
                masks,
                size=(img_h, img_w),
                mode="bilinear",
                align_corners=False)
            mask_pred, scores = self._mask_postprocess(masks, scores, index)

        bbox_pred = paddle.concat(
            [
                labels.unsqueeze(-1).astype('float32'), scores.unsqueeze(-1),
                bbox_pred
            ],
            axis=-1)
        bbox_num = paddle.to_tensor(
            num_kept, dtype='int32').tile([bbox_pred.shape[0]])
        bbox_pred = bbox_pred.reshape([-1, 6])
        return bbox_pred, bbox_num, mask_pred
F
FL77N 已提交
563 564 565 566


@register
class SparsePostProcess(object):
    """
    Select the top proposals per image, rescale and clip their boxes, drop
    degenerate ones, and optionally paste masks onto the original image.

    Args:
        num_proposals (int): number of (proposal, class) pairs kept by
            top-k for every image.
        num_classes (int): number of classes, used to split flattened
            top-k indices into proposal index and label.
        binary_thresh (float): threshold used to binarize pasted masks.
        assign_on_cpu (bool): forwarded to ``paste_mask``; when True the
            device active at call time is restored before returning.
    """
    __shared__ = ['num_classes', 'assign_on_cpu']

    def __init__(self,
                 num_proposals,
                 num_classes=80,
                 binary_thresh=0.5,
                 assign_on_cpu=False):
        super(SparsePostProcess, self).__init__()
        self.num_classes = num_classes
        self.num_proposals = num_proposals
        self.binary_thresh = binary_thresh
        self.assign_on_cpu = assign_on_cpu

    def __call__(self, scores, bboxes, scale_factor, ori_shape, masks=None):
        """
        Args:
            scores (Tensor): class logits per image; sigmoid is applied here.
            bboxes (Tensor): boxes per image, indexed by proposal.
            scale_factor (Tensor): per-image factor the boxes are divided by.
            ori_shape (Tensor): per-image [H, W] used for clipping and as
                the mask canvas size.
            masks (Tensor|None): mask logits; sigmoid is applied here.
        Returns:
            bbox_pred (Tensor): rows of [label, score, box] for all images;
                an image with no valid box contributes one all-zero row.
            bbox_num (Tensor): number of rows of each image, shape [bs].
            mask_pred (Tensor): binary uint8 masks, only returned when
                ``masks`` is given.
        """
        assert len(scores) == len(bboxes) == \
               len(ori_shape) == len(scale_factor)
        # remembered so it can be restored after paste_mask switched to cpu
        device = paddle.device.get_device()
        batch_size = len(ori_shape)

        scores = F.sigmoid(scores)
        has_mask = masks is not None
        if has_mask:
            masks = F.sigmoid(masks)
            # regroup the flat mask batch per image
            masks = masks.reshape([batch_size, -1, *masks.shape[1:]])

        bbox_pred = []
        mask_pred = [] if has_mask else None
        bbox_num = paddle.zeros([batch_size], dtype='int32')
        for i in range(batch_size):
            score = scores[i]
            bbox = bboxes[i]
            # top-k over every (proposal, class) pair
            score, indices = score.flatten(0, 1).topk(
                self.num_proposals, sorted=False)
            label = indices % self.num_classes
            if has_mask:
                mask = masks[i]
                mask = mask.flatten(0, 1)[indices]

            H, W = ori_shape[i][0], ori_shape[i][1]
            # floor division recovers the proposal index of each pair
            bbox = bbox[indices // self.num_classes]
            bbox /= scale_factor[i]
            bbox[:, 0::2] = paddle.clip(bbox[:, 0::2], 0, W)
            bbox[:, 1::2] = paddle.clip(bbox[:, 1::2], 0, H)

            # keep boxes wider and taller than one pixel
            keep = ((bbox[:, 2] - bbox[:, 0]).numpy() > 1.) & \
                   ((bbox[:, 3] - bbox[:, 1]).numpy() > 1.)
            if keep.sum() == 0:
                bbox = paddle.zeros([1, 6], dtype='float32')
                if has_mask:
                    mask = paddle.zeros([1, H, W], dtype='uint8')
            else:
                label = paddle.to_tensor(label.numpy()[keep]).astype(
                    'float32').unsqueeze(-1)
                score = paddle.to_tensor(score.numpy()[keep]).astype(
                    'float32').unsqueeze(-1)
                bbox = paddle.to_tensor(bbox.numpy()[keep]).astype('float32')
                if has_mask:
                    mask = paddle.to_tensor(mask.numpy()[keep]).astype(
                        'float32').unsqueeze(1)
                    mask = paste_mask(mask, bbox, H, W, self.assign_on_cpu)
                    mask = paddle.cast(mask >= self.binary_thresh, 'uint8')
                bbox = paddle.concat([label, score, bbox], axis=-1)

            bbox_num[i] = bbox.shape[0]
            bbox_pred.append(bbox)
            if has_mask:
                mask_pred.append(mask)

        bbox_pred = paddle.concat(bbox_pred)
        mask_pred = paddle.concat(mask_pred) if has_mask else None

        if self.assign_on_cpu:
            paddle.set_device(device)

        if has_mask:
            return bbox_pred, bbox_num, mask_pred
        return bbox_pred, bbox_num
F
FL77N 已提交
645

U
ucsk 已提交
646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670

def paste_mask(masks, boxes, im_h, im_w, assign_on_cpu=False):
    """
    Paste the mask prediction to the original image.

    Args:
        masks (Tensor): mask predictions, one per box; sampled with
            ``F.grid_sample`` (assumed [N, 1, h, w] from the callers in
            this file - confirm for other callers).
        boxes (Tensor): boxes with 4 columns, split as x0, y0, x1, y1.
        im_h, im_w: height and width of the output canvas.
        assign_on_cpu (bool): when True, ``paddle.set_device('cpu')`` is
            called before the sampling grid is built. The device is not
            restored by this function.
    Returns:
        Tensor: channel 0 of the resampled masks.
    """
    left, top, right, bottom = paddle.split(boxes, 4, axis=1)
    num_masks = masks.shape[0]

    # pixel-centre coordinates of the canvas rows and columns
    rows = paddle.arange(0, im_h) + 0.5
    cols = paddle.arange(0, im_w) + 0.5

    # express them relative to each box, box edges landing on -1 and 1;
    # shapes become (N, h) and (N, w)
    rows = (rows - top) / (bottom - top) * 2 - 1
    cols = (cols - left) / (right - left) * 2 - 1

    if assign_on_cpu:
        paddle.set_device('cpu')

    grid_x = cols.unsqueeze(1).expand(
        [num_masks, paddle.shape(rows)[1], paddle.shape(cols)[1]])
    grid_y = rows.unsqueeze(2).expand(
        [num_masks, paddle.shape(rows)[1], paddle.shape(cols)[1]])
    sampling_grid = paddle.stack([grid_x, grid_y], axis=3)
    pasted = F.grid_sample(masks, sampling_grid, align_corners=False)
    return pasted[:, 0]
M
Mark Ma 已提交
671 672


673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689
def multiclass_nms(bboxs, num_classes, match_threshold=0.6, match_metric='iou'):
    """ Run `nms` separately on the boxes of every class.
        Args:
            bboxs: array whose column 0 is the class id; the remaining
                columns are handed to `nms`.
            num_classes: class ids 0 .. num_classes - 1 are processed.
            match_threshold: overlap thresh for match metric.
            match_metric: 'iou' or 'ios'
        Returns:
            list with one array per class that has boxes, each being the
            class id column followed by the rows kept by `nms`.
    """
    per_class = []
    for cls_id in range(num_classes):
        cls_mask = bboxs[:, 0] == cls_id
        if not np.any(cls_mask):
            continue
        kept = nms(bboxs[cls_mask, 1:], match_threshold, match_metric)
        label_col = np.full((kept.shape[0], 1), cls_id)
        per_class.append(np.concatenate([label_col, kept], 1))
    return per_class


def nms(dets, match_threshold=0.6, match_metric='iou'):
    """ Apply NMS to avoid detecting too many overlapping bounding boxes.
        Args:
            dets: shape [N, 5], [score, x1, y1, x2, y2]
            match_metric: 'iou' or 'ios'
            match_threshold: overlap thresh for match metric.
        Returns:
            The rows of `dets` that survive, in their original order.
        Raises:
            ValueError: if `match_metric` is neither 'iou' nor 'ios'.
    """
    # Validate up front so a bad metric is reported for every input, not
    # only when at least two boxes end up being compared.
    if match_metric not in ('iou', 'ios'):
        raise ValueError(
            "match_metric must be 'iou' or 'ios', got {!r}".format(
                match_metric))

    if dets.shape[0] == 0:
        return dets[[], :]

    scores = dets[:, 0]
    x1 = dets[:, 1]
    y1 = dets[:, 2]
    x2 = dets[:, 3]
    y2 = dets[:, 4]
    # the +1 treats coordinates as inclusive pixel indices
    areas = (x2 - x1 + 1) * (y2 - y1 + 1)
    # visit boxes from highest to lowest score
    order = scores.argsort()[::-1]

    suppressed = np.zeros(dets.shape[0], dtype=bool)
    for rank, i in enumerate(order):
        if suppressed[i]:
            continue
        # lower-scored boxes that are still alive
        rest = order[rank + 1:]
        rest = rest[~suppressed[rest]]
        if rest.size == 0:
            continue

        w = np.maximum(
            0.0, np.minimum(x2[i], x2[rest]) - np.maximum(x1[i], x1[rest]) + 1)
        h = np.maximum(
            0.0, np.minimum(y2[i], y2[rest]) - np.maximum(y1[i], y1[rest]) + 1)
        inter = w * h
        if match_metric == 'iou':
            match_value = inter / (areas[i] + areas[rest] - inter)
        else:
            # 'ios': intersection over the smaller of the two areas
            match_value = inter / np.minimum(areas[i], areas[rest])
        suppressed[rest[match_value >= match_threshold]] = True

    return dets[~suppressed]