#include "megbrain/opr/standalone/nms_opr.h" #if MGB_CUDA #include "./nms_kern.cuh" #endif #include "./nms_cpu.h" #include "megbrain/comp_node_env.h" #include "megbrain/serialization/sereg.h" #include "megbrain/utils/arith_helper.h" // for get_aligned_power2 #if MGB_ENABLE_FBS_SERIALIZATION #include "megbrain/serialization/internal/mgb_cpp_opr_generated.h" #include "megbrain/serialization/internal/schema_generated.h" #endif using namespace mgb::opr::standalone; MGB_DYN_TYPE_OBJ_FINAL_IMPL(NMSKeep); class NMSKeep::Kern { public: virtual ~Kern() = default; //! get workspace size in bytes virtual size_t get_workspace_size(const NMSKeep* opr, const TensorShape& boxes) = 0; virtual void exec(const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx, const DeviceTensorND& out_size, const DeviceTensorND& workspace) = 0; }; // f{{{ cuda kernel begins #if MGB_CUDA class NMSKeep::CUDAKern final : public Kern { size_t m_workspace_overlap_mask_bytes, m_workspace_overlap_mask_bytes_align, m_workspace_rm_mask_bytes; void init(const NMSKeep* opr, const TensorShape& boxes) { auto align = opr->comp_node().get_mem_addr_alignment(); size_t nr_boxes = boxes[1]; if (nr_boxes == 0) { m_workspace_overlap_mask_bytes = 0; m_workspace_overlap_mask_bytes_align = 0; m_workspace_rm_mask_bytes = 0; } else { m_workspace_overlap_mask_bytes = nr_boxes * DIVUP(nr_boxes, 64) * sizeof(uint64_t); m_workspace_overlap_mask_bytes_align = get_aligned_power2(m_workspace_overlap_mask_bytes, align); m_workspace_rm_mask_bytes = DIVUP(nr_boxes, 64) * sizeof(uint64_t); } } public: size_t get_workspace_size(const NMSKeep* opr, const TensorShape& boxes) override { init(opr, boxes); return m_workspace_overlap_mask_bytes_align + m_workspace_rm_mask_bytes; } void exec(const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx, const DeviceTensorND& out_size, const DeviceTensorND& workspace) override; }; void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx, const DeviceTensorND& out_size, const DeviceTensorND& workspace) { // NOTE: input comp node might be different from output comp node (for // example, CUDA stream may be modified to overlap computations); a // SingleCNOperatorNodeBase is expected to execute on a single comp node, // and the comp node is defined as the output comp node CompNode comp_node = out_idx.comp_node(); // comp ndoe is also accessible from SingleCNOperatorNode mgb_assert(comp_node == opr->comp_node()); // CompNodeEnv contains platform-specific properties of a CompNode auto&& cuda_env = CompNodeEnv::from_comp_node(comp_node).cuda_env(); mgb_assert(cuda_env.device_prop.warpSize == 32, "invalid warp size: %d", cuda_env.device_prop.warpSize); auto stream = cuda_env.stream; init(opr, inp.shape()); auto inp_ptr = inp.ptr(); void* workspace_ptr = workspace.raw_ptr(); auto dev_overlap_mask = reinterpret_cast(workspace_ptr), dev_rm_mask = (uint64_t*)( workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align); auto out_idx_ptr = reinterpret_cast(out_idx.ptr()), out_size_ptr = reinterpret_cast(out_size.ptr()); size_t batch = inp.shape(0), nr_boxes = inp.shape(1); if (nr_boxes == 0) { MGB_CUDA_CHECK(cudaMemsetAsync(out_size_ptr, 0, batch*sizeof(uint32_t), stream)); return; } MGB_CUDA_CHECK(cudaMemsetAsync(dev_overlap_mask, 0, m_workspace_overlap_mask_bytes, stream)); auto max_output = opr->param().max_output; for (size_t i = 0; i < batch; ++i) { nms::launch_gen_mask(nr_boxes, opr->param().iou_thresh, inp_ptr + i * nr_boxes * 4, DIVUP(nr_boxes, 64), dev_overlap_mask, stream); MGB_CUDA_CHECK(cudaMemsetAsync(dev_rm_mask, 0, m_workspace_rm_mask_bytes, stream)); nms::launch_gen_indices(nr_boxes, max_output, DIVUP(nr_boxes, 64), dev_overlap_mask, dev_rm_mask, out_idx_ptr + i * max_output, out_size_ptr + i, stream); } } #endif // MGB_CUDA for CUDAKern // f}}} cuda kernel ends // f{{{ cpu kernel begins class NMSKeep::CPUKern final : public Kern { public: ~CPUKern() = default; size_t get_workspace_size(const NMSKeep*, const TensorShape& boxes) override { return nms::cpu_kern_workspace(boxes.shape[1]); } void exec(const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx, const DeviceTensorND& out_size, const DeviceTensorND& workspace) override; }; void NMSKeep::CPUKern::exec(const NMSKeep* opr, const DeviceTensorND& inp, const DeviceTensorND& out_idx, const DeviceTensorND& out_size, const DeviceTensorND& workspace) { // See CUDAKern::exec for more explanation on output comp nodes. CompNode comp_node = out_idx.comp_node(); auto inp_ptr = inp.ptr(); auto out_idx_ptr = reinterpret_cast(out_idx.ptr()), out_size_ptr = reinterpret_cast(out_size.ptr()); size_t batch = inp.shape(0), nr_boxes = inp.shape(1); if (nr_boxes == 0) { for (size_t i = 0; i < batch; ++i) { *(out_size_ptr + i) = 0; } return; } auto param = opr->param(); auto workspace_ptr = workspace.raw_ptr(); // NOTE: we must copy all the params into the kernel closure since it would // be dispatched on a different thread auto kern = [=]() { for (size_t i = 0; i < batch; ++i) { nms::cpu_kern(nr_boxes, param.max_output, param.iou_thresh, inp_ptr + i * nr_boxes * 4, out_idx_ptr + i * param.max_output, out_size_ptr + i, workspace_ptr); } }; // The kernel should not be invoked CompNodeEnv::from_comp_node(comp_node).cpu_env().dispatch(kern); } // f}}} cpu kernel ends NMSKeep::NMSKeep(VarNode* boxes, const Param& param, const OperatorNodeConfig& config) : Super(boxes->owner_graph(), // owner graph config, // OperatorNodeConfig "nms_keep", // opr type name (used for generating opr name) {boxes} // input vars for generating opr name ), m_param{param} { mgb_assert(boxes->dtype() == dtype::Float32(), "input should be float32; got %s", boxes->dtype().name()); // setup m_kern according to device type switch (boxes->comp_node().device_type()) { #if MGB_CUDA case CompNode::DeviceType::CUDA: m_kern = std::make_unique(); break; #endif case CompNode::DeviceType::CPU: m_kern = std::make_unique(); break; default: mgb_throw(MegBrainError, "NMSKeep: unsupported device type: %s", boxes->comp_node().to_string().c_str()); } add_input({boxes}); add_output("indices")->dtype(dtype::Int32()) .add_flag(VarNode::Flag::ALLOW_EMPTY_SHAPE); add_output("sizes")->dtype(dtype::Int32()); cg::add_workspace_output(this); // workspace is also an output var // make the graph deduplication system consider m_param (so two oprs with // same input vars but different param values would not be deduplicated) add_equivalence_component>(&m_param); } // impl dtor after Kern is defined NMSKeep::~NMSKeep() noexcept = default; mgb::SymbolVar NMSKeep::make(SymbolVar boxes, const Param& param, const OperatorNodeConfig& config) { // SymbolVar is just a wrapper of VarNode*, with overloaded methods such as // operator+() auto bvar = boxes.node(); // insert opr into the owner graph of boxes return boxes.insert_single_output_opr(bvar, param, config); } void NMSKeep::get_output_var_shape(const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const { auto boxes = inp_shape.at(0); mgb_assert(boxes.ndim == 3 && boxes.shape[2] == 4, "invalid box shape: %s", boxes.to_string().c_str()); // out_shape should match the outputs added in the constructor mgb_assert(out_shape.size() == 3); auto batch = boxes[0]; out_shape[0] = {batch, m_param.max_output}; // indices out_shape[1] = {batch}; // sizes out_shape[2] = {m_kern->get_workspace_size(this, boxes)}; // workspace } void NMSKeep::add_input_layout_constraint() { input(0)->add_layout_constraint_contiguous(); } void NMSKeep::scn_do_execute() { DeviceTensorND empty_workspace; m_kern->exec(this, input(0)->dev_tensor(), output(0)->dev_tensor(), output(1)->dev_tensor(), // if workspace size is 0, output(2) would be invalid and its // dev_tensor() can not be accessed output(2)->dev_tensor_valid() ? output(2)->dev_tensor() : empty_workspace); } NMSKeep::NodeProp* NMSKeep::do_make_node_prop() const { auto ret = Super::do_make_node_prop(); ret->add_dep_type_existing_var(input(0), NodeProp::DepType::VALUE_ALLOW_EMPTY); return ret; } #if MGB_ENABLE_FBS_SERIALIZATION namespace mgb { namespace serialization { namespace fbs { template <> struct ParamConverter { using FlatBufferType = param::NMSKeep; static opr::standalone::NMSKeep::Param to_param(const FlatBufferType* fb) { return {fb->iou_thresh(), fb->max_output()}; } static flatbuffers::Offset to_flatbuffer( flatbuffers::FlatBufferBuilder& builder, const opr::standalone::NMSKeep::Param& p) { return param::CreateNMSKeep(builder, p.iou_thresh, p.max_output); } }; } // namespace fbs } // namespace serialization } // namespace mgb #endif namespace mgb { void _hack_pull_in_nms_opr_object() {} } // namespace mgb // register serialization: the default implementation uses Opr::Param; it // requires Param::TAG, Opr::param() and Opr::make(..., param) to exist // Note: the second param 1 here means that this operator has one input using NMSKeepMGB = NMSKeep; MGB_SEREG_OPR(NMSKeepMGB, 1); // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}