# gpu_nms.py

#!/usr/bin/env mdl
# This file wraps the NMS operator (opr) in a better way than lib_nms.
import ctypes
import os
import struct

import numpy as np

import megengine as mge
import megengine.functional as F
from megengine._internal.craniotome import CraniotomeBase
from megengine.core.tensor import wrap_io_tensor

# Load the prebuilt NMS shared library that lives next to this file. If it
# cannot be loaded, build it from the bundled CUDA source with nvcc and retry.
_current_path = os.path.dirname(os.path.abspath(__file__))
_so_path = os.path.join(_current_path, "lib_nms.so")
try:
    _so_lib = ctypes.CDLL(_so_path)
except OSError:
    # ctypes.CDLL reports a missing or unloadable library as OSError.
    import subprocess

    mge_path = os.path.join(os.path.dirname(mge.__file__), "_internal", "include")
    if not os.path.exists(mge_path):
        # Explicit raise: an assert would be stripped when running with -O.
        raise FileNotFoundError("{} file not found".format(mge_path))
    src_file = os.path.join(_current_path, "gpu_nms", "nms.cu")
    if not os.path.exists(src_file):
        raise FileNotFoundError("{} file not found".format(src_file))

    # An argument list (no shell) hands every path to nvcc verbatim, so
    # directories containing spaces or shell metacharacters still work.
    # The host-compiler flags stay a single argument, exactly as the quoted
    # string was in the former shell command line.
    cmd = [
        "nvcc",
        "-I",
        mge_path,
        "-shared",
        "-o",
        _so_path,
        "-Xcompiler",
        "-fno-strict-aliasing -fPIC",
        src_file,
    ]
    subprocess.check_call(cmd)
    _so_lib = ctypes.CDLL(_so_path)

# ctypes aliases for the C parameter types used by the lib_nms entry points.
_TYPE_POINTER = ctypes.c_void_p
_TYPE_INT = ctypes.c_int32
_TYPE_FLOAT = ctypes.c_float

# Signature of NMSForwardGpu, listed in the order NMSCran.execute passes
# its arguments.
_so_lib.NMSForwardGpu.argtypes = [
    _TYPE_POINTER,  # input box tensor
    _TYPE_POINTER,  # mask tensor
    _TYPE_POINTER,  # output tensor
    _TYPE_POINTER,  # output-count tensor
    _TYPE_FLOAT,  # IoU threshold
    _TYPE_INT,  # maximum number of outputs
    _TYPE_POINTER,  # host-device handle obtained from CreateHostDevice
]
_so_lib.NMSForwardGpu.restype = _TYPE_INT

# ctypes assumes a C int return value unless told otherwise; declaring the
# result as a pointer keeps the returned handle at full pointer width.
_so_lib.CreateHostDevice.restype = _TYPE_POINTER


class NMSCran(CraniotomeBase):
    """Craniotome operator that runs ``NMSForwardGpu`` from the lib_nms library.

    The operator takes one input (the box tensor) and produces three int32
    outputs, in this order: the output tensor, a one-element output-count
    tensor, and a mask tensor that is handed to the native routine.
    """

    __nr_inputs__ = 1
    __nr_outputs__ = 3

    def setup(self, iou_threshold, max_output):
        """Store the NMS parameters and create the native host-device handle.

        Args:
            iou_threshold: forwarded to the native routine as a C float.
            max_output: forwarded to the native routine as a C int32.
        """
        self._iou_threshold = iou_threshold
        self._max_output = max_output
        # Handle created by the native library; it is passed back to the
        # library on every execute() call.
        self._host_device = _so_lib.CreateHostDevice()

    def execute(self, inputs, outputs):
        """Call the native routine with the raw device-tensor pointers.

        Args:
            inputs: operator inputs; only ``inputs[0]`` (the boxes) is used.
            outputs: operator outputs, in the order output, output count, mask.
        """
        boxes_ptr = inputs[0].pubapi_dev_tensor_ptr
        keep_ptr = outputs[0].pubapi_dev_tensor_ptr
        keep_num_ptr = outputs[1].pubapi_dev_tensor_ptr
        mask_ptr = outputs[2].pubapi_dev_tensor_ptr

        # The native signature takes the mask before the two result tensors.
        # NOTE(review): the int returned by NMSForwardGpu is ignored; if it is
        # a status code, failures pass silently -- confirm against nms.cu.
        _so_lib.NMSForwardGpu(
            boxes_ptr,
            mask_ptr,
            keep_ptr,
            keep_num_ptr,
            self._iou_threshold,
            self._max_output,
            self._host_device,
        )

    def grad(self, wrt_idx, inputs, outputs, out_grad):
        """Return 0 for every input.

        NOTE(review): presumably 0 tells the framework that no gradient flows
        through this operator -- confirm against the CraniotomeBase contract.
        """
        return 0

    def init_output_dtype(self, input_dtypes):
        """Declare all three outputs as int32, regardless of the input dtypes."""
        return [np.int32, np.int32, np.int32]

    def get_serialize_params(self):
        """Return the tag ``"nms"`` and the packed parameters.

        The parameters are packed as a float followed by an int (``"fi"``,
        native byte order and alignment).
        """
        return ("nms", struct.pack("fi", self._iou_threshold, self._max_output))

    def infer_shape(self, inp_shapes):
        """Compute the shapes of the three outputs from the input shape.

        Args:
            inp_shapes: list of input shapes; the first dimension of the first
                shape is taken as the number of boxes.

        Returns:
            ``[[nr_box], [1], [mask_size]]`` where ``mask_size`` is counted in
            int32 elements.
        """
        nr_box = inp_shapes[0][0]
        # NOTE(review): presumably has to match the block size used in
        # nms.cu -- confirm.
        threads_per_block = 64
        # Ceiling division: number of blocks needed to cover every box.
        nr_block = (nr_box + threads_per_block - 1) // threads_per_block
        # The byte size of the mask is 8 bytes per (box, block) entry; the mask
        # output is int32 (4 bytes), which gives two elements per entry.
        # Integer arithmetic keeps the size exact instead of passing it
        # through a float division.
        mask_size = int(nr_box * nr_block * 2)
        return [[nr_box], [1], [mask_size]]


@wrap_io_tensor
def gpu_nms(box, iou_threshold, max_output):
    """Run the ``NMSCran`` operator on ``box`` and return its valid results.

    The operator's first output is truncated to the length given by its
    second output; the third (mask) output is discarded.
    """
    results = NMSCran.make(box, iou_threshold=iou_threshold, max_output=max_output)
    kept, count, _ = results
    return kept[:count]


def batched_nms(boxes, scores, idxs, iou_threshold, num_keep, use_offset=False):
    """Run ``gpu_nms`` once over boxes that belong to different groups.

    Every box is shifted by ``idxs * (boxes.max() + 1)`` before NMS, which
    moves boxes with different ``idxs`` values apart from each other.

    Args:
        boxes: box tensor with four columns per row.
        scores: one score per box; appended as a fifth column.
        idxs: one group index per box.
        iou_threshold: forwarded to ``gpu_nms``.
        num_keep: forwarded to ``gpu_nms`` as ``max_output``.
        use_offset: when true, 1 is subtracted from the last two coordinates
            of every box first.

    Returns:
        The result of ``gpu_nms`` on the shifted boxes with their scores.
    """
    if use_offset:
        unit = mge.tensor([0, 0, 1, 1], device=boxes.device)
        unit = unit.reshape(1, 4)
        boxes = boxes - unit.broadcast(boxes.shapeof(0), 4)

    # Per-box shift, one multiple of (largest coordinate + 1) per group index.
    shift = idxs * (boxes.max() + 1)
    shift = shift.reshape(-1, 1).broadcast(boxes.shapeof(0), 4)

    nms_input = F.concat([boxes + shift, scores.reshape(-1, 1)], axis=1)
    return gpu_nms(nms_input, iou_threshold, num_keep)