#pragma once

#include <cuda_runtime.h>
#include <stddef.h>
#include <stdint.h>

/*!
 * \brief integer ceiling division: number of chunks of size \p n needed to
 *      cover \p m items, computed as (m - 1) / n + 1
 *
 * Each macro argument is expanded exactly once.
 *
 * NOTE(review): the formula appears to assume m >= 1. With signed operands
 * and m == 0 the result is 1 (not 0) whenever n > 1, because (0 - 1) / n
 * truncates toward zero; with unsigned operands (m)-1 wraps around. Confirm
 * that callers never rely on DIVUP(0, n) == 0.
 */
#define DIVUP(m, n) (((m)-1) / (n) + 1)

namespace mgb {
namespace opr {
namespace standalone {
namespace nms {

/*!
 * \brief launch the kernel to generate nms mask
 * \param nr_boxes number of input boxes (referred to as n below)
 * \param nms_overlap_thresh overlapping threshold for IoU
 * \param[in] dev_boxes boxes in [n, 4] layout,
 *      each row containing (x0, y0, x1, y1)
 * \param dev_mask_width width in number of uint64_t elements of the dev_mask
 *      matrix; must be at least ceil(n / 64), i.e. one bit per box
 * \param[out] dev_mask [n, dev_mask_width] matrix; dev_mask[i] is a
 *      bitmask of length n indicating whether box i overlaps with each box.
 *      Only the upper triangle (row < col) is filled.
 * \param stream CUDA stream handed to the launcher; presumably the stream on
 *      which the kernel is enqueued (confirm in the implementation whether
 *      the call returns before the kernel has finished)
 */
void launch_gen_mask(const int nr_boxes, const float nms_overlap_thresh,
                     const float* dev_boxes, const int dev_mask_width,
                     uint64_t* dev_mask, cudaStream_t stream);

/*!
 * \brief launch the kernel to generate indices of kept boxes
 * \param nr_boxes number of input boxes
 * \param max_output max number of entries to be written to out_idx
 * \param overlap_mask_width width in number of uint64_t elements of
 *      overlap_mask; presumably the same value as the dev_mask_width given to
 *      launch_gen_mask (TODO confirm against the implementation)
 * \param[in] overlap_mask the mask generated by launch_gen_mask
 * \param[in,out] rm_mask mask of removed boxes; must be initialized as 0
 *      (required size is not stated here; presumably one bit per box, check
 *      the implementation)
 * \param[out] out_idx indices of boxes to be kept
 * \param[out] out_size number of items written to out_idx; the remaining items
 *      of out_idx would be filled with the last valid item
 * \param stream CUDA stream handed to the launcher; presumably the stream on
 *      which the kernel is enqueued (confirm in the implementation whether
 *      the call returns before the kernel has finished)
 */
void launch_gen_indices(int nr_boxes, int max_output, int overlap_mask_width,
                        const uint64_t* overlap_mask, uint64_t* rm_mask,
                        uint32_t* out_idx, uint32_t* out_size,
                        cudaStream_t stream);

}  // namespace nms
}  // namespace standalone
}  // namespace opr
}  // namespace mgb

// vim: ft=cuda syntax=cuda.doxygen