Commit f398c8e6 authored by Megvii Engine Team, committed by XindaH

feat(imperative): add magicmind runtime opr

GitOrigin-RevId: 02ddb886d959104c3aa8320e3ca903e71476c814
Parent f06c890e
@@ -66,3 +66,15 @@ def atlas_runtime_opr(inputs, data):
    op = builtin.AtlasRuntime(data, len(data))
    return apply(op, *inputs)
+
+
+def magicmind_runtime_opr(inputs, data):
+    r"""Load a serialized MagicMind model as a runtime operator in MegEngine.
+
+    Args:
+        inputs: list of input tensors.
+        data: the serialized MagicMind model.
+    """
+    op = builtin.MagicMindRuntime(data, len(data))
+    return apply(op, *inputs)
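
For orientation, a minimal usage sketch (not part of the diff): the import path is assumed by analogy with atlas_runtime_opr in megengine.functional.external, and "model.mm" is a placeholder for a model serialized offline with MagicMind for a Cambricon device.

import numpy as np
import megengine as mge
from megengine.functional.external import magicmind_runtime_opr  # assumed path

# "model.mm" is a hypothetical, pre-serialized MagicMind model file.
with open("model.mm", "rb") as f:
    data = np.frombuffer(f.read(), dtype=np.uint8)

x = mge.tensor(np.random.randn(1, 32, 32, 64).astype(np.float32))  # NHWC, as in the tests below
outputs = magicmind_runtime_opr([x], data=data)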
@@ -130,3 +130,27 @@ class AtlasRuntimeSubgraph(Module):
    def forward(self, *inputs):
        return atlas_runtime_opr(inputs, data=self._data)
+
+
+class MagicMindRuntimeSubgraph(Module):
+    r"""Load a serialized MagicMindRuntime subgraph.
+
+    See :func:`~.magicmind_runtime_opr` for more details.
+    """
+
+    def __init__(self, data, **kwargs):
+        super(MagicMindRuntimeSubgraph, self).__init__(**kwargs)
+        self._data = data
+
+    @property
+    def data(self):
+        return self._data
+
+    @data.setter
+    def data(self, val):
+        self._data = np.frombuffer(val, dtype=np.uint8)
+
+    def forward(self, *inputs):
+        return magicmind_runtime_opr(inputs, data=self._data)
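
A corresponding module-level sketch (again outside the diff; the megengine.module.external import path is assumed by analogy with AtlasRuntimeSubgraph):

import numpy as np
import megengine as mge
from megengine.module.external import MagicMindRuntimeSubgraph  # assumed path

with open("model.mm", "rb") as f:  # hypothetical serialized model file
    m = MagicMindRuntimeSubgraph(np.frombuffer(f.read(), dtype=np.uint8))

x = mge.tensor(np.random.randn(1, 32, 32, 64).astype(np.float32))
outputs = m(x)  # output order follows the MagicMind model definition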
/**
 * \file imperative/src/impl/ops/magicmind_runtime.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "../op_trait.h"
#include "megbrain/imperative/ops/autogen.h"
#if MGB_CAMBRICON
#include "megbrain/cambricon/magicmind_runtime_opr.h"
namespace mgb::imperative {
namespace {
namespace magicmind_runtime {
auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
auto&& op = static_cast<const MagicMindRuntime&>(def);
SymbolVarArray symbol_var_inputs(inputs.begin(), inputs.end());
OperatorNodeConfig config{op.make_name()};
return opr::MagicMindRuntimeOpr::make(
op.buf.c_str(), op.buf_size, symbol_var_inputs, config);
}
OP_TRAIT_REG(MagicMindRuntime, MagicMindRuntime)
.apply_on_var_node(apply_on_var_node)
.fallback();
} // namespace magicmind_runtime
} // namespace
} // namespace mgb::imperative
#endif
@@ -166,8 +166,8 @@ MagicMindRuntimeOpr::MagicMindRuntimeOpr(
        const OperatorNodeConfig& config)
        : Super(inputs[0]->owner_graph(), config, "magic_runtime", inputs),
          m_allocator{std::move(allocator)},
-         m_context{nullptr},
          m_engine{nullptr},
+         m_context{nullptr},
          m_model{std::move(model)} {
    mgb_assert(
            inputs[0]->comp_node().device_type() == CompNode::DeviceType::CAMBRICON,
@@ -207,7 +207,7 @@ void MagicMindRuntimeOpr::scn_do_execute() {
    cnrt_env.activate();
    std::vector<IRTTensor*> inputs, outputs;
    MM_CHECK(CreateInputTensors(m_context.get(), &inputs));
-   MM_CHECK(CreateInputTensors(m_context.get(), &outputs));
+   MM_CHECK(CreateOutputTensors(m_context.get(), &outputs));
    size_t nr_inputs = input().size();
    mgb_assert(nr_inputs == inputs.size());
    for (size_t i = 0; i < nr_inputs; ++i) {
@@ -234,11 +234,9 @@ void MagicMindRuntimeOpr::scn_do_execute() {
    MM_CHECK(m_context->SetWorkspace(output().back()->dev_tensor().raw_ptr(), size));
    MM_CHECK(m_context->Enqueue(inputs, outputs, cnrt_env.queue));
    for (auto&& i : inputs) {
-       i->SetData(nullptr);
        i->Destroy();
    }
    for (auto&& o : outputs) {
-       o->SetData(nullptr);
        o->Destroy();
    }
}
@@ -260,7 +258,7 @@ void MagicMindRuntimeOpr::get_output_var_shape(
    }
    std::vector<IRTTensor*> inputs, outputs;
    MM_CHECK(CreateInputTensors(m_context.get(), &inputs));
-   MM_CHECK(CreateInputTensors(m_context.get(), &outputs));
+   MM_CHECK(CreateOutputTensors(m_context.get(), &outputs));
    size_t nr_inputs = input().size();
    mgb_assert(nr_inputs == inputs.size());
    for (size_t i = 0; i < nr_inputs; ++i) {
@@ -295,12 +293,11 @@ void MagicMindRuntimeOpr::get_output_var_shape(
                false, "static shape infer for MagicMindRuntimeOpr(%s) failed",
                cname());
    }
-   return;
    for (auto&& i : inputs) {
        i->SetData(nullptr);
        i->Destroy();
    }
    for (auto&& o : outputs) {
        o->SetData(nullptr);
        o->Destroy();
    }
}
@@ -332,10 +329,10 @@ void MagicMindRuntimeOpr::init_output_dtype() {
    }
    std::vector<DataType> out_dtypes = m_model->GetOutputDataTypes();
    mgb_assert(
-           out_dtypes.size() == output().size(),
+           out_dtypes.size() + 1 == output().size(),
            "output size mismatch(got:%zu,expected:%zu)", out_dtypes.size(),
            output().size());
-   size_t nr_outputs = output().size();
+   size_t nr_outputs = out_dtypes.size();
    for (size_t i = 0; i < nr_outputs; ++i) {
        auto dt_mm = mm_dtype_to_mgb_dtype(out_dtypes[i]);
        mgb_assert(
......
@@ -90,8 +90,8 @@ public:
private:
    CambriconAllocatorPtr m_allocator;
-   mutable IContextPtr m_context;
    IEnginePtr m_engine;
+   mutable IContextPtr m_context;
    IModelPtr m_model;
};
......
@@ -135,10 +135,10 @@ public:
        constexpr int kh = 3, kw = 3;
        constexpr int stride_h = 1, stride_w = 1;
        constexpr int pad_h = 1, pad_w = 1;
-       magicmind::Dims input_dim{{ni, ci, hi, wi}};
-       magicmind::Dims filter_dim{{co, ci, kh, kw}};
+       magicmind::Dims input_dim{{ni, hi, wi, ci}};
+       magicmind::Dims filter_dim{{co, kh, kw, ci}};
        magicmind::Dims bias_dim{{co}};
-       magicmind::Dims add_dim{{no, co, ho, wo}};
+       magicmind::Dims add_dim{{no, ho, wo, co}};
        magicmind::DataType output_datatype = magicmind::DataType::FLOAT32;
        // init
@@ -148,13 +148,13 @@ public:
        {
            "graph_shape_mutable": {{GRAPH_SHAPE_MUTABLE}},
            "precision_config": {
-               "precision_mode": "qint8_mixed_float16"
+               "precision_mode": "qint8_mixed_float32"
            }
        }
        )";
        replace_all_pairs_inplace(
                user_json_config,
-               {{"{{GRAPH_SHAPE_MUTABLE}}", std::to_string(graph_shape_mutable_)}});
+               {{"{{GRAPH_SHAPE_MUTABLE}}", graph_shape_mutable_ ? "true" : "false"}});
        config->ParseFromString(user_json_config);
        auto network = make_mm_unique_ptr(magicmind::CreateINetwork());
        magicmind::Range filter_range = {0.0f, 0.0f};
@@ -278,6 +278,9 @@ public:
        std::string buf;
        buf.resize(size);
        MM_CHECK(model_->SerializeToMemory(reinterpret_cast<void*>(buf.data()), size));
+       model_.reset();
+       model_ = std::move(MagicMindRuntimeOpr::make_model_ptr(CreateIModel()));
+       model_->DeserializeFromMemory(reinterpret_cast<void*>(buf.data()), size);
        if (serialize_to_file) {
            std::string fname = ssprintf(
                    "./output/MagicMindRuntimeOprTest.%s.mlu",
@@ -332,6 +335,10 @@ public:
        printf("inference time = %.2fs\n", time / static_cast<float>(runs) * 1e-3);
        MGB_CNRT_CHECK(cnrtDestroyNotifier(&start));
        MGB_CNRT_CHECK(cnrtDestroyNotifier(&end));
+       for (auto&& i : input_tensors)
+           i->Destroy();
+       for (auto&& o : output_tensors)
+           o->Destroy();
    }
};
}  // namespace
@@ -387,9 +394,9 @@ TEST(TestMagicMindRuntimeOpr, Basic) {
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
-           {conv_input_mlu_ptr, add_output_mlu_ptr},
+           {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
-           {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+           {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});
    // result memory copy cnml->cpu
    // memory copy cpu->mlu
@@ -402,9 +409,9 @@ TEST(TestMagicMindRuntimeOpr, Basic) {
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
-           cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
+           cn, TensorLayout{{ni, hi, wi, ci}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
-           cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+           cn, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->ptr<dt_float32>()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(float));
@@ -418,13 +425,13 @@ TEST(TestMagicMindRuntimeOpr, Basic) {
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
-   HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-   HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1(cn, {no, ho, wo, co}, dtype::Float32());
+   HostTensorND o2(cn, {no, ho, wo, co}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
-   HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
-           o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1_mm(cn, {no, ho, wo, co}, dtype::Float32()),
+           o2_mm(cn, {no, ho, wo, co}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
@@ -486,9 +493,9 @@ TEST(TestMagicMindRuntimeOpr, InputQInt8) {
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
-           {conv_input_mlu_ptr, add_output_mlu_ptr},
+           {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
-           {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+           {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});
    // result memory copy cnml->cpu
    // memory copy cpu->mlu
@@ -501,9 +508,9 @@ TEST(TestMagicMindRuntimeOpr, InputQInt8) {
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
-           cn, TensorLayout{{ni, ci, hi, wi}, dtype::QuantizedS8{1.f}});
+           cn, TensorLayout{{ni, hi, wi, ci}, dtype::QuantizedS8{1.f}});
    auto add = std::make_shared<HostTensorND>(
-           cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+           cn, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->raw_ptr()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(int8_t));
@@ -517,13 +524,13 @@ TEST(TestMagicMindRuntimeOpr, InputQInt8) {
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
-   HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-   HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1(cn, {no, ho, wo, co}, dtype::Float32());
+   HostTensorND o2(cn, {no, ho, wo, co}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
-   HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
-           o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1_mm(cn, {no, ho, wo, co}, dtype::Float32()),
+           o2_mm(cn, {no, ho, wo, co}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
@@ -591,9 +598,9 @@ TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
-           {conv_input_mlu_ptr, add_output_mlu_ptr},
+           {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
-           {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+           {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});
    // result memory copy cnml->cpu
    // memory copy cpu->mlu
@@ -607,11 +614,11 @@ TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
    auto buf = network.get_serialized_model(true);
    auto mkshp = [](int n, int c, int h, int w) {
        size_t nz = n, cz = c, hz = h, wz = w;
-       return TensorShape{nz, cz, hz, wz};
+       return TensorShape{nz, hz, wz, cz};
    };
    auto mkly = [](int n, int c, int h, int w, DType dtype) {
        size_t nz = n, cz = c, hz = h, wz = w;
-       return TensorLayout{{nz, cz, hz, wz}, dtype};
+       return TensorLayout{{nz, hz, wz, cz}, dtype};
    };
    auto x = std::make_shared<HostTensorND>(
            cn, mkly(ni, ci, hi, wi, dtype::Float32()));
@@ -662,9 +669,9 @@ TEST(TestMagicMindRuntimeOpr, Serialization) {
    const int ni = 1, ci = 64, hi = 32, wi = 32;
    const int no = 1, co = 64, ho = 32, wo = 32;
    auto x = std::make_shared<HostTensorND>(
-           cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
+           cn, TensorLayout{{ni, hi, wi, ci}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
-           cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+           cn, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
@@ -693,11 +700,11 @@ TEST(TestMagicMindRuntimeOpr, Profiling) {
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    auto buf = network.get_serialized_model(false);
    const int ni = 8, ci = 64, hi = 32, wi = 32;
-   const int no = 1, co = 64, ho = 32, wo = 32;
+   const int no = 8, co = 64, ho = 32, wo = 32;
    HostTensorGenerator<dtype::Float32, RandomDistribution::GAUSSIAN> gen(0, 1);
-   auto x = gen({ni, ci, hi, wi}, cn);
-   auto add = gen({no, co, ho, wo}, cn);
+   auto x = gen({ni, hi, wi, ci}, cn);
+   auto add = gen({no, ho, wo, co}, cn);
    auto graph = ComputingGraph::make();
    GraphProfiler profiler{graph.get()};
@@ -708,8 +715,8 @@ TEST(TestMagicMindRuntimeOpr, Profiling) {
    auto out1 = outs[0];
    auto out2 = outs[1];
    graph->options().var_sanity_check_first_run = false;
-   HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-   HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1(cn, {no, ho, wo, co}, dtype::Float32());
+   HostTensorND o2(cn, {no, ho, wo, co}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
@@ -768,9 +775,9 @@ TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
-           {conv_input_mlu_ptr, add_output_mlu_ptr},
+           {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
-           {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+           {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});
    // result memory copy cnml->cpu
    // memory copy cpu->mlu
@@ -784,9 +791,9 @@ TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
    auto cn_cpu = CompNode::load("cpu0");
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
-           cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
+           cn_cpu, TensorLayout{{ni, hi, wi, ci}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
-           cn_cpu, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+           cn_cpu, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->ptr<dt_float32>()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(float));
@@ -802,13 +809,13 @@ TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
-   HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-   HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1(CompNode::default_cpu(), {no, ho, wo, co}, dtype::Float32());
+   HostTensorND o2(CompNode::default_cpu(), {no, ho, wo, co}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
-   HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
-           o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
+   HostTensorND o1_mm(cn, {no, ho, wo, co}, dtype::Float32()),
+           o2_mm(cn, {no, ho, wo, co}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
......
@@ -388,6 +388,13 @@ def CambriconRuntime: MgbHashableOp<"CambriconRuntime"> {
  );
}

+def MagicMindRuntime: MgbHashableOp<"MagicMindRuntime"> {
+  let extraArguments = (ins
+    MgbStringAttr:$buf,
+    MgbSizeTAttr:$buf_size
+  );
+}
+
def CvtColor: MgbHashableOp<"CvtColor", [CvtColorParam]>;
def CheckNonFinite: MgbHashableOp<"CheckNonFinite", [EmptyParam]>;
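
A note on the attribute plumbing (a hedged sketch, not part of the diff): the $buf/$buf_size pair declared for MagicMindRuntime above is what the Python wrapper fills in, assuming the autogenerated op is exposed via megengine.core.ops.builtin.

from megengine.core.ops import builtin

data = b"..."  # placeholder bytes of a serialized MagicMind model
op = builtin.MagicMindRuntime(data, len(data))  # fills $buf and $buf_size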
......