/**
 * \file src/opr/impl/mc20_runtime_op.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "megbrain/opr/mc20_runtime_op.h"
#include "megbrain/common.h"
#include "megbrain/graph/event.h"
#include "megdnn/dtype.h"

#include <memory>

#if MGB_MC20
using namespace mgb;
using namespace opr;

namespace {
TensorShape mc20_shape_to_mgb_shape(AX_NPU_SDK_EX_TENSOR_META_T tensor_meta) {
    TensorShape ret;
    ret.ndim = tensor_meta.nShapeNDim;
    for (size_t i = 0; i < ret.ndim; ++i) {
        ret[i] = tensor_meta.pShape[i];
    }
    return ret;
}

DType mc20_dtype_to_mgb_dtype(AX_NPU_SDK_EX_ADV_TENSOR_DTYPE data_type) {
    switch (data_type) {
        case AX_NPU_TDT_UINT8:
            return dtype::Uint8();
        case AX_NPU_TDT_FLOAT32:
            return dtype::Float32();
        case AX_NPU_TDT_INT16:
            return dtype::Int16();
        case AX_NPU_TDT_INT32:
            return dtype::Int32();
        default:
            mgb_throw(
                    MegBrainError, "MC20DataType %d is not supported by MegBrain.",
                    static_cast<int>(data_type));
    }
}
}  // namespace

constexpr AX_NPU_SDK_EX_HANDLE_T MC20RuntimeOpr::INVALID_MODEL_HANDLE;

/* ====================== MC20RuntimeOpr ==================== */
MGB_DYN_TYPE_OBJ_FINAL_IMPL(MC20RuntimeOpr);

MC20RuntimeOpr::MC20RuntimeOpr(
        SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
        const VarNodeArray& inputs, const OperatorNodeConfig& config)
        : Super(inputs[0]->owner_graph(), config, "mc20_runtime", inputs),
          m_buffer{std::move(buf)},
          m_model_handle(model_handle) {
    mgb_assert(
            inputs[0]->comp_node().device_type() == CompNode::DeviceType::MC20,
            "MC20RuntimeOpr can only be used on mc20 comp node; got %s",
            inputs[0]->comp_node().to_string().c_str());
    for (auto i : inputs) {
        add_input({i});
    }
    if (m_model_handle == INVALID_MODEL_HANDLE) {
        MGB_MC20_CHECK(AX_NPU_SDK_EX_Create_handle(
                &m_model_handle, m_buffer.data(), m_buffer.size()));
        m_is_model_holder = true;
    }

    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    size_t nr_outputs = io_info->nOutputSize;
    bool has_workspace = false;
    if (nr_outputs == 1) {
        const auto& tensor_meta = *(io_info->pOutputs[0].pTensorMeta);
        add_output(std::string(reinterpret_cast<const char*>(tensor_meta.pName)));
        if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
            mgb_assert(tensor_meta.nInnerSize > 0);
            has_workspace = true;
        }
    } else {
        for (size_t i = 0; i < nr_outputs; ++i) {
            const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
            add_output(std::string(reinterpret_cast<const char*>(tensor_meta.pName)));
            if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
                mgb_assert(tensor_meta.nInnerSize > 0);
                has_workspace = true;
            }
        }
    }
    mgb_assert(has_workspace, "Currently only support model with cpu tail");

    //! \warning There is no interface in MC20 to get the batch size of the
    //! model. MC20 supports multi-batch by changing an n-batch input into n
    //! 1-batch inputs.
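    //! For illustration only (hypothetical numbers, not from a real .neu
    //! model): a model compiled for 4-batch execution exposes 4 single-batch
    //! copies of each logical input, so with a single megbrain input var
    //! io_info->nInputSize == 4 and m_model_batch below is 4 / 1 = 4.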
    mgb_assert(
            io_info->nInputSize % inputs.size() == 0,
            "The number of inputs in the neu model should be a multiple of "
            "the number of inputs in megbrain, but got %zu (neu model) vs "
            "%zu (mgb model)",
            io_info->nInputSize, inputs.size());
    m_model_batch = io_info->nInputSize / inputs.size();

    add_equivalence_component<mgb::ScalarHash<const void*>>(m_buffer.data());
    cg::add_workspace_output(this);
}

MC20RuntimeOpr::~MC20RuntimeOpr() {
    if (m_is_model_holder) {
        MGB_MC20_CHECK(AX_NPU_SDK_EX_Destroy_handle(m_model_handle));
    }
}

void MC20RuntimeOpr::execute_mc20() {
    auto&& mc20_env =
            CompNodeEnv::from_comp_node(input(0)->comp_node()).mc20_env();
    mc20_env.activate();

    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    AX_NPU_SDK_EX_IO_T npu_io;
    memset(&npu_io, 0, sizeof(npu_io));

    size_t batch_size = input(0)->dev_tensor().layout().shape[0];
    //! the megbrain batch is split into batch_size / m_model_batch sequential
    //! model runs
    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx += m_model_batch) {
        //! prepare input
        npu_io.nInputSize = io_info->nInputSize;
        auto inputs = std::make_unique<AX_NPU_SDK_EX_BUF_T[]>(npu_io.nInputSize);
        npu_io.pInputs = inputs.get();
        for (size_t i = 0; i < npu_io.nInputSize; i++) {
            // get input addr info
            size_t inp_idx = i / m_model_batch;
            AX_VOID* p_virtual_addr = input(inp_idx)->dev_tensor().raw_ptr();
            AX_U64 phy_addr =
                    MC20MemoryManager::Instance().get_phyaddr(p_virtual_addr);
            auto nr_bytes_per_batch =
                    input(inp_idx)->layout().span().dist_byte() / batch_size;
            // add batch offset
            p_virtual_addr = reinterpret_cast<AX_VOID*>(
                    reinterpret_cast<uintptr_t>(p_virtual_addr) +
                    nr_bytes_per_batch * (batch_idx + i % m_model_batch));
            phy_addr += nr_bytes_per_batch * (batch_idx + i % m_model_batch);
            MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
                    phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
                    p_virtual_addr, nr_bytes_per_batch, &npu_io.pInputs[i]));
        }

        //! prepare output
        npu_io.nOutputSize = io_info->nOutputSize;
        auto outputs = std::make_unique<AX_NPU_SDK_EX_BUF_T[]>(npu_io.nOutputSize);
        npu_io.pOutputs = outputs.get();

        AX_U32 offset = 0;
        AX_VOID* inner_virtual_addr_start = nullptr;
        AX_U64 inner_phy_addr_start = 0;
        // get inner addr from workspace
        inner_virtual_addr_start = output(npu_io.nOutputSize)->dev_tensor().raw_ptr();
        inner_phy_addr_start =
                MC20MemoryManager::Instance().get_phyaddr(inner_virtual_addr_start);
        for (size_t i = 0; i < npu_io.nOutputSize; i++) {
            // get output addr info
            AX_VOID* p_virtual_addr = output(i)->dev_tensor().raw_ptr();
            AX_U64 phy_addr = 0;
            auto nr_bytes_per_batch =
                    output(i)->layout().span().dist_byte() / batch_size;
            // add batch offset
            p_virtual_addr = reinterpret_cast<AX_VOID*>(
                    reinterpret_cast<uintptr_t>(p_virtual_addr) +
                    nr_bytes_per_batch * batch_idx);
            phy_addr += nr_bytes_per_batch * batch_idx;
            const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
            if (tensor_meta.eMemoryType == AX_NPU_MT_PHYSICAL) {
                MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
                        phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
                        p_virtual_addr, nr_bytes_per_batch, &npu_io.pOutputs[i]));
            } else if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
                // outputs produced by the cpu tail live in the workspace
                auto p_inner_virtual_addr = reinterpret_cast<AX_VOID*>(
                        reinterpret_cast<uintptr_t>(inner_virtual_addr_start) +
                        offset);
                auto inner_phy_addr = inner_phy_addr_start + offset;
                MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
                        phy_addr, p_virtual_addr, nr_bytes_per_batch, inner_phy_addr,
                        p_inner_virtual_addr, tensor_meta.nInnerSize,
                        &npu_io.pOutputs[i]));
                offset += tensor_meta.nInnerSize;
            }
        }
        MGB_MC20_CHECK(AX_NPU_SDK_EX_Run_task_sync(m_model_handle, &npu_io));
    }
}
void MC20RuntimeOpr::init_output_comp_node() {
    //! set outputs to the cpu comp node if the model has a cpu tail
    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    CompNode input_cn;
    for (auto&& i : input()) {
        if (!input_cn.valid()) {
            input_cn = i->comp_node();
        } else {
            mgb_assert(
                    input_cn.mem_node() == i->comp_node().mem_node(),
                    "opr %s{%s} requires all inputs to be on the same memory "
                    "node expect=%s cur_var=%s cur_cn=%s",
                    this->cname(), this->dyn_typeinfo()->name,
                    input_cn.to_string().c_str(), i->cname(),
                    i->comp_node().to_string().c_str());
        }
    }
    for (size_t i = 0; i < io_info->nOutputSize; i++) {
        //! comp node of the var should be default_cpu as the output will be
        //! proxied to the user
        output(i)->comp_node(CompNode::default_cpu());
    }
    //! the last output is the workspace, which should be on the same comp node
    //! as the inputs
    output(io_info->nOutputSize)->comp_node(input_cn);
}

MC20RuntimeOpr::NodeProp* MC20RuntimeOpr::do_make_node_prop() const {
    auto ret = Super::do_make_node_prop();
    ret->add_flag(NodeProp::Flag::CROSS_COMP_NODE_MEMORY);
    return ret;
}

void MC20RuntimeOpr::do_execute(ExecEnv& env) {
    CompNode cn = output(0)->comp_node();
    auto runner = [this, cn]() {
        this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(this, cn);
        cn.activate();
        execute_mc20();
        this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(this, cn);
    };
    env.dispatch_on_comp_node(cn, runner);

    // send BeforeKernel/AfterKernel events on every different comp node
    ThinHashSet<CompNode> st = cg::get_opr_comp_node_set(this);
    for (auto cn : st) {
        auto send_event = [this, cn]() {
            this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(
                    this, cn);
            this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(
                    this, cn);
        };
        env.dispatch_on_comp_node(cn, send_event);
    }
}

void MC20RuntimeOpr::on_output_comp_node_stream_changed() {
    mgb_throw(SystemError, "comp node of output should not change");
}

void MC20RuntimeOpr::get_output_var_shape(
        const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    size_t nr_inputs = io_info->nInputSize;
    for (size_t i = 0; i < nr_inputs; ++i) {
        const auto& tensor_meta = *(io_info->pInputs[i].pTensorMeta);
        auto model_shape = mc20_shape_to_mgb_shape(tensor_meta);
        size_t inp_idx = i / m_model_batch;
        // enable multi-batch
        mgb_assert(
                inp_shape[inp_idx][0] % model_shape[0] == 0 &&
                        (inp_shape[inp_idx][0] / model_shape[0]) % m_model_batch == 0,
                "input %zu batch is %zu, while model's input batch is %zu", i,
                inp_shape[inp_idx][0], model_shape[0]);
        model_shape[0] = inp_shape[inp_idx][0];
        mgb_assert(
                model_shape.eq_shape(inp_shape[inp_idx]),
                "shape mismatch of input %zu, expected: %s got: %s", i,
                model_shape.to_string().c_str(),
                inp_shape[inp_idx].to_string().c_str());
    }

    size_t input_batch = (io_info->pInputs[0].pTensorMeta)->pShape[0];
    //! \warning the mc20 sdk implements multi-batch by breaking an n-batch
    //! input up into n 1-batch inputs
    mgb_assert(
            input_batch == 1, "input batch: %zu, net's input batch: 1", input_batch);
    AX_U32 workspace_size = 0;
    for (size_t i = 0; i < io_info->nOutputSize; ++i) {
        const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
        out_shape[i] = mc20_shape_to_mgb_shape(tensor_meta);
        // enable multi-batch
        out_shape[i][0] =
                out_shape[i][0] * inp_shape[0][0] / input_batch / m_model_batch;
        if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
            workspace_size += tensor_meta.nInnerSize;
        }
    }
    out_shape.back() = {workspace_size};
}
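// A hypothetical walk-through of the multi-batch shape arithmetic above
// (numbers are illustrative only, not taken from a real .neu model): assume
// m_model_batch == 2, the model input meta has batch 1 (so input_batch == 1)
// and megbrain feeds a (6, 3, 224, 224) input, i.e. inp_shape[0][0] == 6. If
// one model run yields an output meta of shape (2, 1000), the output var is
// scaled to out_shape[i][0] = 2 * 6 / 1 / 2 = 6, i.e. (6, 1000): one row per
// megbrain batch, produced by the 6 / 2 = 3 sequential runs in execute_mc20().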
void MC20RuntimeOpr::add_input_layout_constraint() {
    //! default contiguous
    for (auto i : input()) {
        i->add_layout_constraint_contiguous();
    }
}

void MC20RuntimeOpr::init_output_dtype() {
    DType dt_mc20, dt_input;
    const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
            AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
    for (size_t i = 0; i < io_info->nInputSize; ++i) {
        dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pInputs[i].eDType);
        size_t inp_idx = i / m_model_batch;
        dt_input = input(inp_idx)->dtype();
        mgb_assert(
                dt_mc20.valid() && dt_input.valid() &&
                        dt_mc20.enumv() == dt_input.enumv(),
                "dtype mismatch of input %zu: expected %s, got %s", i,
                dt_mc20.name(), dt_input.name());
    }

    for (size_t i = 0; i < io_info->nOutputSize; ++i) {
        dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pOutputs[i].eDType);
        mgb_assert(
                dt_mc20.valid(),
                "output dtype checking failed: invalid dtype returned.");
        if (!output(i)->dtype().valid())
            output(i)->dtype(dt_mc20);
    }
}

SymbolVarArray MC20RuntimeOpr::make(
        SharedBuffer buf, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
            std::move(buf), INVALID_MODEL_HANDLE, var_node_array, config);
    auto ret = cg::to_symbol_var_array(src[0].node()
                                               ->owner_graph()
                                               ->insert_opr(std::move(mc20_runtime_opr))
                                               ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

SymbolVarArray MC20RuntimeOpr::make(
        const void* buf, size_t size, const SymbolVarArray& src,
        const OperatorNodeConfig& config) {
    mgb_throw_if(
            !CompNode::get_device_count(CompNode::DeviceType::MC20), SystemError,
            "can not create MC20RuntimeOpr when mc20 is not available");
    std::shared_ptr<uint8_t> shptr{new uint8_t[size], [](uint8_t* p) { delete[] p; }};
    memcpy(shptr.get(), buf, size);
    SharedBuffer buffer{std::move(shptr), size};
    return make(std::move(buffer), src, config);
}

SymbolVarArray MC20RuntimeOpr::make(
        SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
        const SymbolVarArray& src, const OperatorNodeConfig& config) {
    VarNodeArray var_node_array = cg::to_var_node_array(src);
    auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
            std::move(buf), model_handle, var_node_array, config);
    auto ret = cg::to_symbol_var_array(src[0].node()
                                               ->owner_graph()
                                               ->insert_opr(std::move(mc20_runtime_opr))
                                               ->output());
    ret.pop_back();  // remove workspace
    return ret;
}

#endif  // MGB_MC20

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}