#pragma once

#include <cuda_runtime.h>

#include <cstddef>
#include <cstdint>

/*!
 * \brief integer ceiling division: number of chunks of size \p n needed to
 *      cover \p m items
 *
 * Intended for m >= 1 and n >= 1. Note that with signed operands
 * DIVUP(0, n) evaluates to 1 (not 0) because integer division truncates
 * towards zero, and with unsigned operands (m)-1 wraps around; callers that
 * may pass m == 0 must account for that themselves.
 */
#define DIVUP(m, n) (((m)-1) / (n) + 1)

namespace mgb {
namespace opr {
namespace standalone {
namespace nms {

/*!
 * \brief launch the kernel to generate nms mask
 * \param nr_boxes number of input boxes (denoted n below)
 * \param nms_overlap_thresh overlapping threshold for IoU
 * \param[in] dev_boxes boxes in [n, 4] layout,
 *      each row containing (x0, y0, x1, y1)
 * \param dev_mask_width width in number of uint64_t elements of the dev_mask
 *      matrix; must be at least ceil(n / 64), i.e. DIVUP(n, 64)
 * \param[out] dev_mask [n, dev_mask_width] dev_mask[i] is a
 *      bitmask of length n indicating whether i overlaps with each box. Only
 *      the upper triangle (row < col) is filled.
 * \param stream CUDA stream handed to the launch (presumably the stream the
 *      kernel is enqueued on; see the implementation for synchronization
 *      behavior)
 */
void launch_gen_mask(const int nr_boxes, const float nms_overlap_thresh,
                     const float* dev_boxes, const int dev_mask_width,
                     uint64_t* dev_mask, cudaStream_t stream);

/*!
 * \brief launch the kernel to generate indices of kept boxes
 * \param nr_boxes number of input boxes
 * \param max_output max number of entries to be written to out_idx
 * \param overlap_mask_width width in number of uint64_t elements of each row
 *      of overlap_mask (presumably the dev_mask_width that was given to
 *      launch_gen_mask)
 * \param overlap_mask the mask generated by launch_gen_mask
 * \param[in,out] rm_mask mask of removed boxes; must be initialized as 0
 * \param[out] out_idx indices of boxes to be kept
 * \param[out] out_size number of items written to out_idx; the remaining items
 *      would be filled with the last valid item
 * \param stream CUDA stream handed to the launch (presumably the stream the
 *      kernel is enqueued on; see the implementation for synchronization
 *      behavior)
 */
void launch_gen_indices(int nr_boxes, int max_output, int overlap_mask_width,
                        const uint64_t* overlap_mask, uint64_t* rm_mask,
                        uint32_t* out_idx, uint32_t* out_size,
                        cudaStream_t stream);

}  // namespace nms
}  // namespace standalone
}  // namespace opr
}  // namespace mgb

// vim: ft=cuda syntax=cuda.doxygen