Commit 11556cba authored by Megvii Engine Team

feat(megbrain): add mc20 bazel

GitOrigin-RevId: 96d57656e49e3ef72b26a3a9a464546b23c3fca8
Parent d2c37fe4
find_path(MC20_ROOT_DIR
include/ax_interpreter_external_api.h
PATHS
${PROJECT_SOURCE_DIR}/third_party/mc20/
$ENV{MC20DIR}
)
if(NOT MC20_ROOT_DIR)
message(FATAL_ERROR "Cannot find MC20")
endif()
message(STATUS "Build with MC20 in ${MC20_ROOT_DIR}")
find_path(MC20_INCLUDE_DIR
ax_interpreter_external_api.h
PATHS
${MC20_ROOT_DIR}/include
${INCLUDE_INSTALL_DIR}
)
add_library(libmc20 INTERFACE IMPORTED)
find_library(MC20_LIBRARY
NAMES libax_interpreter_external.x86.a
PATHS ${MC20_ROOT_DIR}/lib/)
if(NOT MC20_LIBRARY)
message(FATAL_ERROR "Cannot find MC20 library")
endif()
target_link_libraries(libmc20 INTERFACE ${MC20_LIBRARY})
target_include_directories(libmc20 INTERFACE ${MC20_INCLUDE_DIR})
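# Usage sketch (the consumer target name below is illustrative): linking
# against the imported interface target pulls in both the static library
# and its include directory:
#   target_link_libraries(megbrain PRIVATE libmc20)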
/**
* \file src/opr/impl/mc20_runtime_op.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megbrain/opr/mc20_runtime_op.h"
#include "megbrain/common.h"
#include "megbrain/graph/event.h"
#include "megdnn/dtype.h"
#include <memory>
#if MGB_MC20
using namespace mgb;
using namespace opr;
namespace {
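//! Convert the shape carried in an MC20 tensor descriptor to a MegBrain
//! TensorShape.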
TensorShape mc20_shape_to_mgb_shape(const AX_NPU_SDK_EX_TENSOR_META_T& tensor_meta) {
TensorShape ret;
ret.ndim = tensor_meta.nShapeNDim;
for (size_t i = 0; i < ret.ndim; ++i) {
ret[i] = tensor_meta.pShape[i];
}
return ret;
}
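//! Map an MC20 SDK dtype enum to the corresponding MegBrain DType; throws
//! MegBrainError on unsupported types.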
DType mc20_dtype_to_mgb_dtype(AX_NPU_SDK_EX_ADV_TENSOR_DTYPE data_type) {
switch (data_type) {
case AX_NPU_TDT_UINT8:
return dtype::Uint8();
case AX_NPU_TDT_FLOAT32:
return dtype::Float32();
case AX_NPU_TDT_INT16:
return dtype::Int16();
case AX_NPU_TDT_INT32:
return dtype::Int32();
default:
mgb_throw(
MegBrainError, "MC20DataType %d is not supported by MegBrain.",
static_cast<int>(data_type));
}
}
} // namespace
constexpr AX_NPU_SDK_EX_HANDLE_T MC20RuntimeOpr::INVALID_MODEL_HANDLE;
/* ====================== MC20RuntimeOpr ==================== */
MGB_DYN_TYPE_OBJ_FINAL_IMPL(MC20RuntimeOpr);
MC20RuntimeOpr::MC20RuntimeOpr(
SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
const VarNodeArray& inputs, const OperatorNodeConfig& config)
: Super(inputs[0]->owner_graph(), config, "mc20_runtime", inputs),
m_buffer{std::move(buf)},
m_model_handle(model_handle) {
mgb_assert(
inputs[0]->comp_node().device_type() == CompNode::DeviceType::MC20,
"MC20RuntimeOpr can only be used on mc20 comp node; "
"got %s",
inputs[0]->comp_node().to_string().c_str());
for (auto i : inputs) {
add_input({i});
}
if (m_model_handle == INVALID_MODEL_HANDLE) {
MGB_MC20_CHECK(AX_NPU_SDK_EX_Create_handle(
&m_model_handle, m_buffer.data(), m_buffer.size()));
m_is_model_holder = true;
}
const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
size_t nr_outputs = io_info->nOutputSize;
bool has_workspace = false;
for (size_t i = 0; i < nr_outputs; ++i) {
const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
add_output(std::string(reinterpret_cast<char*>(tensor_meta.pName)));
if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
mgb_assert(tensor_meta.nInnerSize > 0);
has_workspace = true;
}
}
mgb_assert(has_workspace, "Currently only models with a cpu tail are supported");
//! \warning There is no interface in MC20 to get the batch size of the
//! model. MC20 supports multi-batch by splitting an n-batch input into n
//! 1-batch inputs.
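//! e.g. a graph with 2 inputs and a neu model reporting nInputSize == 8
//! yields m_model_batch == 4: each SDK run consumes 4 batches of every
//! graph input.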
mgb_assert(
io_info->nInputSize % inputs.size() == 0,
"The number of inputs in the neu model should be multiple of "
"the number of inputs in megbrain, but got %zu(neu model) vs "
"%zu(mgb model)",
io_info->nInputSize, inputs.size());
m_model_batch = static_cast<size_t>(io_info->nInputSize) / inputs.size();
add_equivalence_component<mgb::ScalarHash<const void*>>(m_buffer.data());
cg::add_workspace_output(this);
}
MC20RuntimeOpr::~MC20RuntimeOpr() {
if (m_is_model_holder) {
MGB_MC20_CHECK(AX_NPU_SDK_EX_Destroy_handle(m_model_handle));
}
}
void MC20RuntimeOpr::execute_mc20() {
auto&& mc20_env = CompNodeEnv::from_comp_node(input(0)->comp_node()).mc20_env();
mc20_env.activate();
const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
AX_NPU_SDK_EX_IO_T npu_io;
memset(&npu_io, 0, sizeof(npu_io));
size_t batch_size = input(0)->dev_tensor().layout().shape[0];
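// Run the model repeatedly, consuming m_model_batch batches per iteration,
// until the whole input batch has been processed.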
for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx += m_model_batch) {
//! prepare input
npu_io.nInputSize = io_info->nInputSize;
auto inputs = std::make_unique<AX_NPU_SDK_EX_BUF_T[]>(npu_io.nInputSize);
npu_io.pInputs = inputs.get();
for (size_t i = 0; i < npu_io.nInputSize; i++) {
// get input addr info
size_t inp_idx = i / m_model_batch;
AX_VOID* p_virtual_addr = input(inp_idx)->dev_tensor().raw_ptr();
AX_U64 phy_addr = MC20MemoryManager::Instance().get_phyaddr(p_virtual_addr);
auto nr_bytes_per_batch =
input(inp_idx)->layout().span().dist_byte() / batch_size;
// add batch offset
p_virtual_addr = reinterpret_cast<AX_VOID*>(
reinterpret_cast<AX_U64>(p_virtual_addr) +
nr_bytes_per_batch * (batch_idx + i % m_model_batch));
phy_addr += nr_bytes_per_batch * (batch_idx + i % m_model_batch);
MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
p_virtual_addr, nr_bytes_per_batch, &npu_io.pInputs[i]));
}
//! prepare output
npu_io.nOutputSize = io_info->nOutputSize;
auto outputs = std::make_unique<AX_NPU_SDK_EX_BUF_T[]>(npu_io.nOutputSize);
npu_io.pOutputs = outputs.get();
AX_U32 offset = 0;
AX_VOID* inner_virtual_addr_start = nullptr;
AX_U64 inner_phy_addr_start = 0;
// get inner addr from workspace
inner_virtual_addr_start = output(npu_io.nOutputSize)->dev_tensor().raw_ptr();
inner_phy_addr_start =
MC20MemoryManager::Instance().get_phyaddr(inner_virtual_addr_start);
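// Each VIRTUAL (cpu-tail) output is mapped to a slice of the workspace;
// `offset` advances by the tensor's inner size after each such output.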
for (size_t i = 0; i < npu_io.nOutputSize; i++) {
// get output addr info
AX_VOID* p_virtual_addr = output(i)->dev_tensor().raw_ptr();
AX_U64 phy_addr = 0;
auto nr_bytes_per_batch =
output(i)->layout().span().dist_byte() / batch_size;
// add batch offset
p_virtual_addr = reinterpret_cast<AX_VOID*>(
reinterpret_cast<AX_U64>(p_virtual_addr) +
nr_bytes_per_batch * batch_idx);
phy_addr += nr_bytes_per_batch * batch_idx;
const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
if (tensor_meta.eMemoryType == AX_NPU_MT_PHYSICAL) {
MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
phy_addr, p_virtual_addr, nr_bytes_per_batch, phy_addr,
p_virtual_addr, nr_bytes_per_batch, &npu_io.pOutputs[i]));
} else if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
auto p_inner_virtual_addr = reinterpret_cast<AX_VOID*>(
reinterpret_cast<AX_U64>(inner_virtual_addr_start) + offset);
auto innerphy_addr = inner_phy_addr_start + offset;
MGB_MC20_CHECK(AX_NPU_SDK_EX_ADV_Make_io_buffer(
phy_addr, p_virtual_addr, nr_bytes_per_batch, innerphy_addr,
p_inner_virtual_addr, tensor_meta.nInnerSize,
&npu_io.pOutputs[i]));
offset += tensor_meta.nInnerSize;
}
}
MGB_MC20_CHECK(AX_NPU_SDK_EX_Run_task_sync(m_model_handle, &npu_io));
}
}
void MC20RuntimeOpr::init_output_comp_node() {
//! set outputs to the cpu comp node if the model has a cpu tail
const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
CompNode input_cn;
for (auto&& i : input()) {
if (!input_cn.valid()) {
input_cn = i->comp_node();
} else {
mgb_assert(
input_cn.mem_node() == i->comp_node().mem_node(),
"opr %s{%s} requires all input to be on the same memory "
"node expect=%s cur_var=%s cur_cn=%s",
this->cname(), this->dyn_typeinfo()->name,
input_cn.to_string().c_str(), i->cname(),
i->comp_node().to_string().c_str());
}
}
for (size_t i = 0; i < io_info->nOutputSize; i++) {
//! comp node of the var should be default_cpu as the output will be
//! proxied to the user
output(i)->comp_node(CompNode::default_cpu());
}
//! the last output is workspace, which should be the same as input
output(io_info->nOutputSize)->comp_node(input_cn);
}
MC20RuntimeOpr::NodeProp* MC20RuntimeOpr::do_make_node_prop() const {
auto ret = Super::do_make_node_prop();
ret->add_flag(NodeProp::Flag::CROSS_COMP_NODE_MEMORY);
return ret;
}
void MC20RuntimeOpr::do_execute(ExecEnv& env) {
CompNode cn = output(0)->comp_node();
auto runner = [this, cn]() {
this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(this, cn);
cn.activate();
execute_mc20();
this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(this, cn);
};
env.dispatch_on_comp_node(cn, runner);
// Send BeforeKernel/AfterKernel event on every different comp_node
ThinHashSet<mgb::CompNode> st = cg::get_opr_comp_node_set(this);
for (auto cn : st) {
auto send_event = [this, cn]() {
this->owner_graph()->event().signal_inplace<cg::event::BeforeKernel>(
this, cn);
this->owner_graph()->event().signal_inplace<cg::event::AfterKernel>(
this, cn);
};
env.dispatch_on_comp_node(cn, send_event);
}
}
void MC20RuntimeOpr::on_output_comp_node_stream_changed() {
mgb_throw(SystemError, "comp node of output should not change");
}
void MC20RuntimeOpr::get_output_var_shape(
const TensorShapeArray& inp_shape, TensorShapeArray& out_shape) const {
const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
size_t nr_inputs = io_info->nInputSize;
for (size_t i = 0; i < nr_inputs; ++i) {
const auto& tensor_meta = *(io_info->pInputs[i].pTensorMeta);
auto model_shape = mc20_shape_to_mgb_shape(tensor_meta);
size_t inp_idx = i / m_model_batch;
// enable multi-batch
mgb_assert(
inp_shape[inp_idx][0] % model_shape[0] == 0 &&
(inp_shape[inp_idx][0] / model_shape[0]) % m_model_batch == 0,
"input %zu batch is %zu, while model's input batch is %zu", i,
inp_shape[inp_idx][0], model_shape[0]);
model_shape[0] = inp_shape[inp_idx][0];
mgb_assert(
model_shape.eq_shape(inp_shape[inp_idx]),
"shape mismatch of input %zu, expected: %s got: %s", i,
model_shape.to_string().c_str(),
inp_shape[inp_idx].to_string().c_str());
}
size_t input_batch = (io_info->pInputs[0].pTensorMeta)->pShape[0];
//! \warning the mc20 sdk implements multi-batch by breaking an n-batch
//! input up into n 1-batch inputs
mgb_assert(input_batch == 1, "input batch: %zu, net's input batch: 1", input_batch);
AX_U32 workspace_size = 0;
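// VIRTUAL (cpu-tail) outputs are staged in the trailing workspace var;
// its size is the sum of their inner sizes.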
for (size_t i = 0; i < io_info->nOutputSize; ++i) {
const auto& tensor_meta = *(io_info->pOutputs[i].pTensorMeta);
out_shape[i] = mc20_shape_to_mgb_shape(tensor_meta);
// enable multi-batch
out_shape[i][0] =
out_shape[i][0] * inp_shape[0][0] / input_batch / m_model_batch;
if (tensor_meta.eMemoryType == AX_NPU_MT_VIRTUAL) {
workspace_size += tensor_meta.nInnerSize;
}
}
out_shape.back() = {workspace_size};
}
void MC20RuntimeOpr::add_input_layout_constraint() {
//! default contiguous
for (auto i : input()) {
i->add_layout_constraint_contiguous();
}
}
void MC20RuntimeOpr::init_output_dtype() {
DType dt_mc20, dt_input;
const AX_NPU_SDK_EX_ADV_IO_INFO_T* io_info =
AX_NPU_SDK_EX_ADV_Get_io_info(m_model_handle);
for (size_t i = 0; i < io_info->nInputSize; ++i) {
dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pInputs[i].eDType);
size_t inp_idx = i / m_model_batch;
dt_input = input(inp_idx)->dtype();
mgb_assert(
dt_mc20.valid() && dt_input.valid() &&
dt_mc20.enumv() == dt_input.enumv(),
"dtype mismatch of input %zu: expected %s, "
"got %s",
i, dt_mc20.name(), dt_input.name());
}
for (size_t i = 0; i < io_info->nOutputSize; ++i) {
dt_mc20 = mc20_dtype_to_mgb_dtype(io_info->pOutputs[i].eDType);
mgb_assert(
dt_mc20.valid(),
"output dtype checking failed: invalid dtype returned.");
if (!output(i)->dtype().valid())
output(i)->dtype(dt_mc20);
}
}
SymbolVarArray MC20RuntimeOpr::make(
SharedBuffer buf, const SymbolVarArray& src, const OperatorNodeConfig& config) {
VarNodeArray var_node_array = cg::to_var_node_array(src);
auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
std::move(buf), INVALID_MODEL_HANDLE, var_node_array, config);
auto ret = cg::to_symbol_var_array(src[0].node()
->owner_graph()
->insert_opr(std::move(mc20_runtime_opr))
->output());
ret.pop_back(); // remove workspace
return ret;
}
SymbolVarArray MC20RuntimeOpr::make(
const void* buf, size_t size, const SymbolVarArray& src,
const OperatorNodeConfig& config) {
mgb_throw_if(
!CompNode::get_device_count(CompNode::DeviceType::MC20), SystemError,
"can not create MC20RuntimeOpr when mc20 is not "
"available");
std::shared_ptr<uint8_t> shptr{new uint8_t[size], [](uint8_t* p) { delete[] p; }};
memcpy(shptr.get(), buf, size);
SharedBuffer buffer{std::move(shptr), size};
return make(std::move(buffer), src, config);
}
SymbolVarArray MC20RuntimeOpr::make(
SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
const SymbolVarArray& src, const OperatorNodeConfig& config) {
VarNodeArray var_node_array = cg::to_var_node_array(src);
auto mc20_runtime_opr = std::make_unique<MC20RuntimeOpr>(
std::move(buf), model_handle, var_node_array, config);
auto ret = cg::to_symbol_var_array(src[0].node()
->owner_graph()
->insert_opr(std::move(mc20_runtime_opr))
->output());
ret.pop_back(); // remove workspace
return ret;
}
#endif // MGB_MC20
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
decl_raw_opr(
'mc20_runtime',
desc='create an operator that loads and runs an mc20 model',
inputs=[
Doc('inputs', 'input vars', 'list of :class:`.SymbolVar`'),
Doc('data_bytes', 'serialized mc20 model'),
],
body=[
'assert isinstance(data_bytes, bytes), '
'"data must be bytes; got {}".format(type(data_bytes))',
'output = _mgb._Opr.mc20_runtime(inputs, data_bytes, config)',
'cvt_result_kwargs["explode_single"] = False',
],
)
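# Usage sketch (the module path of the generated wrapper is an assumption):
#   with open('model.neu', 'rb') as f:
#       outputs = mgb.opr.mc20_runtime(inputs, f.read())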
# vim: ft=python
/**
* \file src/opr/impl/mc20_runtime_op.sereg.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "megbrain/opr/mc20_runtime_op.h"
#include "megbrain/serialization/sereg.h"
#if MGB_MC20
namespace mgb {
using MC20RuntimeOpr = opr::MC20RuntimeOpr;
namespace serialization {
template <>
struct OprLoadDumpImpl<MC20RuntimeOpr, 0> {
static void dump(OprDumpContext& ctx, const cg::OperatorNodeBase& opr_) {
auto&& opr = opr_.cast_final_safe<opr::MC20RuntimeOpr>();
auto&& buf = opr.buffer();
auto&& name = opr.name();
ctx.dump_buf_with_len(buf.data(), buf.size());
ctx.dump_buf_with_len(name.c_str(), name.size());
}
static cg::OperatorNodeBase* load(
OprLoadContext& ctx, const cg::VarNodeArray& inputs,
const OperatorNodeConfig& config) {
inputs.at(0)->comp_node().activate();
auto buf = ctx.load_shared_buf_with_len();
auto name = ctx.load_shared_buf_with_len();
std::string c_name(reinterpret_cast<const char*>(name.data()), name.size());
OperatorNodeConfig c_config = config;
c_config.name(c_name);
return opr::MC20RuntimeOpr::make(
std::move(buf), cg::to_symbol_var_array(inputs), c_config)
.at(0)
.node()
->owner_opr();
}
};
} // namespace serialization
namespace opr {
cg::OperatorNodeBase* opr_shallow_copy_mc20_runtime_opr(
const serialization::OprShallowCopyContext& ctx,
const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs,
const OperatorNodeConfig& config) {
MGB_MARK_USED_VAR(ctx);
auto&& opr = opr_.cast_final_safe<MC20RuntimeOpr>();
return MC20RuntimeOpr::make(
opr.buffer(), opr.model_handle(), cg::to_symbol_var_array(inputs),
config)
.at(0)
.node()
->owner_opr();
}
MGB_SEREG_OPR(MC20RuntimeOpr, 0);
MGB_REG_OPR_SHALLOW_COPY(MC20RuntimeOpr, opr_shallow_copy_mc20_runtime_opr);
} // namespace opr
} // namespace mgb
#endif
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
/**
* \file src/opr/include/megbrain/opr/mc20_runtime_op.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#pragma once
#include <memory>
#include "megbrain/comp_node_env.h"
#include "megbrain/graph.h"
#include "megbrain/graph/operator_node.h"
#include "megbrain/serialization/file.h"
#include "megdnn/thin/function.h"
#if MGB_MC20
#include "megbrain/mc20/mc20_memory_manager.h"
namespace mgb {
namespace opr {
MGB_DEFINE_OPR_CLASS(
MC20RuntimeOpr, cg::OutshapePureByInshapeOpr<cg::OperatorNodeBase>) // {
public:
using SharedBuffer = mgb::serialization::SharedBuffer;
void do_execute(ExecEnv& env) override;
void get_output_var_shape(
const TensorShapeArray& inp_shape,
TensorShapeArray& out_shape) const override;
void add_input_layout_constraint() override;
void init_output_dtype() override;
void init_output_comp_node() override;
void on_output_comp_node_stream_changed() override;
/**
* \brief create MC20RuntimeOpr with buf
*/
MC20RuntimeOpr(
SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
const VarNodeArray& inputs, const OperatorNodeConfig& config);
~MC20RuntimeOpr();
const SharedBuffer& buffer() const { return m_buffer; }
AX_NPU_SDK_EX_HANDLE_T model_handle() const { return m_model_handle; }
static SymbolVarArray make(
SharedBuffer buf, const SymbolVarArray& src,
const OperatorNodeConfig& config = {});
static SymbolVarArray make(
const void* buf, size_t size, const SymbolVarArray& src,
const OperatorNodeConfig& config = {});
static SymbolVarArray make(
SharedBuffer buf, AX_NPU_SDK_EX_HANDLE_T model_handle,
const SymbolVarArray& src, const OperatorNodeConfig& config = {});
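//! Usage sketch (variable names are illustrative); the returned vars do
//! not include the trailing workspace var:
//!     auto outs = MC20RuntimeOpr::make(model_data, model_size, {x});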
private:
NodeProp* do_make_node_prop() const override;
void execute_mc20();
size_t m_model_batch;
SharedBuffer m_buffer;
constexpr static AX_NPU_SDK_EX_HANDLE_T INVALID_MODEL_HANDLE = nullptr;
AX_NPU_SDK_EX_HANDLE_T m_model_handle = INVALID_MODEL_HANDLE;
//! if true, the destructor releases the model handle
bool m_is_model_holder = false;
};
} // namespace opr
} // namespace mgb
#endif // MGB_MC20
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}