diff --git a/imperative/python/megengine/functional/external.py b/imperative/python/megengine/functional/external.py
index 03455d4a5a21d9aae54c7a38c4ef0e6f9111c98a..3286bbc4b58441db0dd4c92ef81dd478c6bbeb74 100644
--- a/imperative/python/megengine/functional/external.py
+++ b/imperative/python/megengine/functional/external.py
@@ -66,3 +66,15 @@ def atlas_runtime_opr(inputs, data):

     op = builtin.AtlasRuntime(data, len(data))
     return apply(op, *inputs)
+
+
+def magicmind_runtime_opr(inputs, data):
+    r"""Load a serialized MagicMind model as a runtime operator in MegEngine.
+
+    Args:
+        inputs: list of input tensors.
+        data: the serialized MagicMind model.
+    """
+
+    op = builtin.MagicMindRuntime(data, len(data))
+    return apply(op, *inputs)
diff --git a/imperative/python/megengine/module/external.py b/imperative/python/megengine/module/external.py
index 8a8d0cbcdb0a7927f72604a0a614fbb33cb44bb8..949b2bde753eff90c185ea16e04ebff55d9faa8e 100644
--- a/imperative/python/megengine/module/external.py
+++ b/imperative/python/megengine/module/external.py
@@ -130,3 +130,25 @@ class AtlasRuntimeSubgraph(Module):

     def forward(self, *inputs):
         return atlas_runtime_opr(inputs, data=self._data)
+
+
+class MagicMindRuntimeSubgraph(Module):
+    r"""Load a serialized MagicMindRuntime subgraph.
+
+    See :func:`~.magicmind_runtime_opr` for more details.
+    """
+
+    def __init__(self, data, **kwargs):
+        super(MagicMindRuntimeSubgraph, self).__init__(**kwargs)
+        self._data = data
+
+    @property
+    def data(self):
+        return self._data
+
+    @data.setter
+    def data(self, val):
+        self._data = np.frombuffer(val, dtype=np.uint8)
+
+    def forward(self, *inputs):
+        return magicmind_runtime_opr(inputs, data=self._data)
diff --git a/imperative/src/impl/ops/magicmind_runtime.cpp b/imperative/src/impl/ops/magicmind_runtime.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9bb54605cebc17dd0b6f274ac8f80afc7aab2f5c
--- /dev/null
+++ b/imperative/src/impl/ops/magicmind_runtime.cpp
@@ -0,0 +1,36 @@
+/**
+ * \file imperative/src/impl/ops/magicmind_runtime.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+#include "../op_trait.h"
+#include "megbrain/imperative/ops/autogen.h"
+
+#if MGB_CAMBRICON
+#include "megbrain/cambricon/magicmind_runtime_opr.h"
+namespace mgb::imperative {
+
+namespace {
+namespace magicmind_runtime {
+
+auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) {
+    auto&& op = static_cast<const MagicMindRuntime&>(def);
+    SymbolVarArray symbol_var_inputs(inputs.begin(), inputs.end());
+    OperatorNodeConfig config{op.make_name()};
+    return opr::MagicMindRuntimeOpr::make(
+            op.buf.c_str(), op.buf_size, symbol_var_inputs, config);
+}
+OP_TRAIT_REG(MagicMindRuntime, MagicMindRuntime)
+        .apply_on_var_node(apply_on_var_node)
+        .fallback();
+}  // namespace magicmind_runtime
+}  // namespace
+
+}  // namespace mgb::imperative
+#endif
diff --git a/src/cambricon/impl/magicmind_runtime_opr.cpp b/src/cambricon/impl/magicmind_runtime_opr.cpp
index c748f11d910c44e12f63463dc78befc6b1e81996..44aa894cb4406449a788272c9425ba71a098c360 100644
--- a/src/cambricon/impl/magicmind_runtime_opr.cpp
+++ b/src/cambricon/impl/magicmind_runtime_opr.cpp
@@ -166,8 +166,8 @@ MagicMindRuntimeOpr::MagicMindRuntimeOpr(
         const OperatorNodeConfig& config)
         : Super(inputs[0]->owner_graph(), config, "magic_runtime", inputs),
           m_allocator{std::move(allocator)},
-          m_context{nullptr},
           m_engine{nullptr},
+          m_context{nullptr},
           m_model{std::move(model)} {
     mgb_assert(
             inputs[0]->comp_node().device_type() == CompNode::DeviceType::CAMBRICON,
@@ -207,7 +207,7 @@ void MagicMindRuntimeOpr::scn_do_execute() {
     cnrt_env.activate();
     std::vector<magicmind::IRTTensor*> inputs, outputs;
     MM_CHECK(CreateInputTensors(m_context.get(), &inputs));
-    MM_CHECK(CreateInputTensors(m_context.get(), &outputs));
+    MM_CHECK(CreateOutputTensors(m_context.get(), &outputs));
     size_t nr_inputs = input().size();
     mgb_assert(nr_inputs == inputs.size());
     for (size_t i = 0; i < nr_inputs; ++i) {
@@ -234,11 +234,9 @@
     MM_CHECK(m_context->SetWorkspace(output().back()->dev_tensor().raw_ptr(), size));
     MM_CHECK(m_context->Enqueue(inputs, outputs, cnrt_env.queue));
     for (auto&& i : inputs) {
-        i->SetData(nullptr);
         i->Destroy();
     }
     for (auto&& o : outputs) {
-        o->SetData(nullptr);
         o->Destroy();
     }
 }
@@ -260,7 +258,7 @@ void MagicMindRuntimeOpr::get_output_var_shape(
     }
     std::vector<magicmind::IRTTensor*> inputs, outputs;
     MM_CHECK(CreateInputTensors(m_context.get(), &inputs));
-    MM_CHECK(CreateInputTensors(m_context.get(), &outputs));
+    MM_CHECK(CreateOutputTensors(m_context.get(), &outputs));
     size_t nr_inputs = input().size();
     mgb_assert(nr_inputs == inputs.size());
     for (size_t i = 0; i < nr_inputs; ++i) {
@@ -295,12 +293,10 @@
                 false, "static shape infer for MagicMindRuntimeOpr(%s) failed",
                 cname());
     }
     for (auto&& i : inputs) {
-        i->SetData(nullptr);
         i->Destroy();
     }
     for (auto&& o : outputs) {
-        o->SetData(nullptr);
         o->Destroy();
     }
 }
@@ -332,10 +328,10 @@ void MagicMindRuntimeOpr::init_output_dtype() {
     }
     std::vector<magicmind::DataType> out_dtypes = m_model->GetOutputDataTypes();
     mgb_assert(
-            out_dtypes.size() == output().size(),
+            out_dtypes.size() + 1 == output().size(),
             "output size mismatch(got:%zu,expected:%zu)", out_dtypes.size(),
             output().size());
-    size_t nr_outputs = output().size();
+    size_t nr_outputs = out_dtypes.size();
     for (size_t i = 0; i < nr_outputs; ++i) {
         auto dt_mm = mm_dtype_to_mgb_dtype(out_dtypes[i]);
         mgb_assert(
diff --git a/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h b/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h
index 0695099bfc4b95540d02ce30892fe59e5b073c78..b72cf87162679f36dcabe81ef029aa50eb9b14fc 100644
--- a/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h
+++ b/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h
@@ -90,8 +90,8 @@ public:

 private:
     CambriconAllocatorPtr m_allocator;
-    mutable IContextPtr m_context;
     IEnginePtr m_engine;
+    mutable IContextPtr m_context;
     IModelPtr m_model;
 };

diff --git a/src/cambricon/test/magicmind_runtime_opr.cpp b/src/cambricon/test/magicmind_runtime_opr.cpp
index b6f6461474acce38b88e96db5c046dcaad5ce2a2..55e3ebc50f1a3da2e8d86283a9221485968555b3 100644
--- a/src/cambricon/test/magicmind_runtime_opr.cpp
+++ b/src/cambricon/test/magicmind_runtime_opr.cpp
@@ -135,10 +135,10 @@ public:
         constexpr int kh = 3, kw = 3;
         constexpr int stride_h = 1, stride_w = 1;
         constexpr int pad_h = 1, pad_w = 1;
-        magicmind::Dims input_dim{{ni, ci, hi, wi}};
-        magicmind::Dims filter_dim{{co, ci, kh, kw}};
+        magicmind::Dims input_dim{{ni, hi, wi, ci}};
+        magicmind::Dims filter_dim{{co, kh, kw, ci}};
         magicmind::Dims bias_dim{{co}};
-        magicmind::Dims add_dim{{no, co, ho, wo}};
+        magicmind::Dims add_dim{{no, ho, wo, co}};
         magicmind::DataType output_datatype = magicmind::DataType::FLOAT32;

         // init
@@ -148,13 +148,13 @@ public:
         {
             "graph_shape_mutable": {{GRAPH_SHAPE_MUTABLE}},
             "precision_config": {
-                "precision_mode": "qint8_mixed_float16"
+                "precision_mode": "qint8_mixed_float32"
             }
         }
         )";
         replace_all_pairs_inplace(
                 user_json_config,
-                {{"{{GRAPH_SHAPE_MUTABLE}}", std::to_string(graph_shape_mutable_)}});
+                {{"{{GRAPH_SHAPE_MUTABLE}}", graph_shape_mutable_ ? "true" : "false"}});
         config->ParseFromString(user_json_config);
         auto network = make_mm_unique_ptr(magicmind::CreateINetwork());
         magicmind::Range filter_range = {0.0f, 0.0f};
@@ -278,6 +278,9 @@ public:
         std::string buf;
         buf.resize(size);
         MM_CHECK(model_->SerializeToMemory(reinterpret_cast<void*>(buf.data()), size));
+        model_.reset();
+        model_ = MagicMindRuntimeOpr::make_model_ptr(CreateIModel());
+        MM_CHECK(model_->DeserializeFromMemory(reinterpret_cast<void*>(buf.data()), size));
         if (serialize_to_file) {
             std::string fname = ssprintf(
                     "./output/MagicMindRuntimeOprTest.%s.mlu",
@@ -332,6 +335,10 @@ public:
         printf("inference time = %.2fs\n", time / static_cast<float>(runs) * 1e-3);
         MGB_CNRT_CHECK(cnrtDestroyNotifier(&start));
         MGB_CNRT_CHECK(cnrtDestroyNotifier(&end));
+        for (auto&& i : input_tensors)
+            i->Destroy();
+        for (auto&& o : output_tensors)
+            o->Destroy();
     }
 };
 }  // namespace
@@ -387,9 +394,9 @@ TEST(TestMagicMindRuntimeOpr, Basic) {
             add_output_mlu_ptr, mlu_deleter};

     network.infer_model(
-            {conv_input_mlu_ptr, add_output_mlu_ptr},
+            {conv_input_mlu_ptr, add_input_mlu_ptr},
             {relu_output_mlu_ptr, add_output_mlu_ptr},
-            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+            {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});

     // result memory copy cnml->cpu
     // memory copy cpu->mlu
@@ -402,9 +409,9 @@ TEST(TestMagicMindRuntimeOpr, Basic) {
     auto buf = network.get_serialized_model(false);

     auto x = std::make_shared<HostTensorND>(
-            cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
+            cn, TensorLayout{{ni, hi, wi, ci}, dtype::Float32()});
     auto add = std::make_shared<HostTensorND>(
-            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+            cn, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
     std::memcpy(
             reinterpret_cast<void*>(x->ptr<float>()), conv_input_cpu_data.data(),
             conv_input_count * sizeof(float));
@@ -418,13 +425,13 @@ TEST(TestMagicMindRuntimeOpr, Basic) {
             reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
     auto out1 = outs[0];
     auto out2 = outs[1];
-    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1(cn, {no, ho, wo, co}, dtype::Float32());
+    HostTensorND o2(cn, {no, ho, wo, co}, dtype::Float32());
     auto func = graph->compile(
             {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
     func->execute();
-    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
-            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1_mm(cn, {no, ho, wo, co}, dtype::Float32()),
+            o2_mm(cn, {no, ho, wo, co}, dtype::Float32());
     std::memcpy(
             o1_mm.ptr<float>(), relu_output_cpu_data.data(),
             relu_output_count * sizeof(float));
@@ -486,9 +493,9 @@ TEST(TestMagicMindRuntimeOpr, InputQInt8) {
             add_output_mlu_ptr, mlu_deleter};

     network.infer_model(
-            {conv_input_mlu_ptr, add_output_mlu_ptr},
+            {conv_input_mlu_ptr, add_input_mlu_ptr},
             {relu_output_mlu_ptr, add_output_mlu_ptr},
-            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+            {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});

     // result memory copy cnml->cpu
     // memory copy cpu->mlu
@@ -501,9 +508,9 @@ TEST(TestMagicMindRuntimeOpr, InputQInt8) {
     auto buf = network.get_serialized_model(false);

     auto x = std::make_shared<HostTensorND>(
-            cn, TensorLayout{{ni, ci, hi, wi}, dtype::QuantizedS8{1.f}});
+            cn, TensorLayout{{ni, hi, wi, ci}, dtype::QuantizedS8{1.f}});
     auto add = std::make_shared<HostTensorND>(
-            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+            cn, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
     std::memcpy(
             reinterpret_cast<void*>(x->raw_ptr()), conv_input_cpu_data.data(),
             conv_input_count * sizeof(int8_t));
@@ -517,13 +524,13 @@ TEST(TestMagicMindRuntimeOpr, InputQInt8) {
             reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
     auto out1 = outs[0];
     auto out2 = outs[1];
-    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1(cn, {no, ho, wo, co}, dtype::Float32());
+    HostTensorND o2(cn, {no, ho, wo, co}, dtype::Float32());
     auto func = graph->compile(
             {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
     func->execute();
-    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
-            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1_mm(cn, {no, ho, wo, co}, dtype::Float32()),
+            o2_mm(cn, {no, ho, wo, co}, dtype::Float32());
     std::memcpy(
             o1_mm.ptr<float>(), relu_output_cpu_data.data(),
             relu_output_count * sizeof(float));
@@ -591,9 +598,9 @@ TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
             add_output_mlu_ptr, mlu_deleter};

     network.infer_model(
-            {conv_input_mlu_ptr, add_output_mlu_ptr},
+            {conv_input_mlu_ptr, add_input_mlu_ptr},
             {relu_output_mlu_ptr, add_output_mlu_ptr},
-            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+            {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});

     // result memory copy cnml->cpu
     // memory copy cpu->mlu
@@ -607,11 +614,11 @@ TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
     auto buf = network.get_serialized_model(true);
     auto mkshp = [](int n, int c, int h, int w) {
         size_t nz = n, cz = c, hz = h, wz = w;
-        return TensorShape{nz, cz, hz, wz};
+        return TensorShape{nz, hz, wz, cz};
     };
     auto mkly = [](int n, int c, int h, int w, DType dtype) {
         size_t nz = n, cz = c, hz = h, wz = w;
-        return TensorLayout{{nz, cz, hz, wz}, dtype};
+        return TensorLayout{{nz, hz, wz, cz}, dtype};
     };
     auto x = std::make_shared<HostTensorND>(
             cn, mkly(ni, ci, hi, wi, dtype::Float32()));
@@ -662,9 +669,9 @@ TEST(TestMagicMindRuntimeOpr, Serialization) {
     const int ni = 1, ci = 64, hi = 32, wi = 32;
    const int no = 1, co = 64, ho = 32, wo = 32;
     auto x = std::make_shared<HostTensorND>(
-            cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
+            cn, TensorLayout{{ni, hi, wi, ci}, dtype::Float32()});
     auto add = std::make_shared<HostTensorND>(
-            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+            cn, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
     auto graph = ComputingGraph::make();
     auto x_ = opr::Host2DeviceCopy::make(*graph, x);
     auto add_ = opr::Host2DeviceCopy::make(*graph, add);
@@ -693,11 +700,11 @@ TEST(TestMagicMindRuntimeOpr, Profiling) {
     MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
     auto buf = network.get_serialized_model(false);
     const int ni = 8, ci = 64, hi = 32, wi = 32;
-    const int no = 1, co = 64, ho = 32, wo = 32;
+    const int no = 8, co = 64, ho = 32, wo = 32;

     HostTensorGenerator<> gen(0, 1);
-    auto x = gen({ni, ci, hi, wi}, cn);
-    auto add = gen({no, co, ho, wo}, cn);
+    auto x = gen({ni, hi, wi, ci}, cn);
+    auto add = gen({no, ho, wo, co}, cn);
     auto graph = ComputingGraph::make();

     GraphProfiler profiler{graph.get()};
@@ -708,8 +715,8 @@ TEST(TestMagicMindRuntimeOpr, Profiling) {
     auto out1 = outs[0];
     auto out2 = outs[1];
     graph->options().var_sanity_check_first_run = false;
-    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1(cn, {no, ho, wo, co}, dtype::Float32());
+    HostTensorND o2(cn, {no, ho, wo, co}, dtype::Float32());
     auto func = graph->compile(
             {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
     func->execute();
@@ -768,9 +775,9 @@ TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
             add_output_mlu_ptr, mlu_deleter};

     network.infer_model(
-            {conv_input_mlu_ptr, add_output_mlu_ptr},
+            {conv_input_mlu_ptr, add_input_mlu_ptr},
             {relu_output_mlu_ptr, add_output_mlu_ptr},
-            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
+            {Dims{{ni, hi, wi, ci}}, Dims{{no, ho, wo, co}}});

     // result memory copy cnml->cpu
     // memory copy cpu->mlu
@@ -784,9 +791,9 @@ TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
     auto cn_cpu = CompNode::load("cpu0");
     auto buf = network.get_serialized_model(false);
     auto x = std::make_shared<HostTensorND>(
-            cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
+            cn_cpu, TensorLayout{{ni, hi, wi, ci}, dtype::Float32()});
     auto add = std::make_shared<HostTensorND>(
-            cn_cpu, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
+            cn_cpu, TensorLayout{{no, ho, wo, co}, dtype::Float32()});
     std::memcpy(
             reinterpret_cast<void*>(x->ptr<float>()), conv_input_cpu_data.data(),
             conv_input_count * sizeof(float));
@@ -802,13 +809,13 @@ TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
             reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
     auto out1 = outs[0];
     auto out2 = outs[1];
-    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
-    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1(CompNode::default_cpu(), {no, ho, wo, co}, dtype::Float32());
+    HostTensorND o2(CompNode::default_cpu(), {no, ho, wo, co}, dtype::Float32());
     auto func = graph->compile(
             {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
     func->execute();
-    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
-            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
+    HostTensorND o1_mm(cn, {no, ho, wo, co}, dtype::Float32()),
+            o2_mm(cn, {no, ho, wo, co}, dtype::Float32());
     std::memcpy(
             o1_mm.ptr<float>(), relu_output_cpu_data.data(),
             relu_output_count * sizeof(float));
diff --git a/src/core/include/megbrain/ir/ops.td b/src/core/include/megbrain/ir/ops.td
index d500f532c79d0dcf94ee5140350db3b7bcb9768b..96d1de37c5ada66ab33a3730ec23ce866c0dec7b 100644
--- a/src/core/include/megbrain/ir/ops.td
+++ b/src/core/include/megbrain/ir/ops.td
@@ -388,6 +388,13 @@ def CambriconRuntime: MgbHashableOp<"CambriconRuntime"> {
   );
 }

+def MagicMindRuntime: MgbHashableOp<"MagicMindRuntime"> {
+  let extraArguments = (ins
+    MgbStringAttr:$buf,
+    MgbSizeTAttr:$buf_size
+  );
+}
+
 def CvtColor: MgbHashableOp<"CvtColor", [CvtColorParam]>;

 def CheckNonFinite: MgbHashableOp<"CheckNonFinite", [EmptyParam]>;
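
For reference, a minimal usage sketch of the new Python entry point (assuming a Cambricon/MagicMind-enabled build of MegEngine; the file name "model.mm" and the NHWC input shape below are illustrative placeholders, not part of the patch):

    import numpy as np
    import megengine as mge
    from megengine.module.external import MagicMindRuntimeSubgraph

    # A MagicMind model previously serialized to disk (hypothetical file name).
    with open("model.mm", "rb") as f:
        data = f.read()

    # Wrap the serialized model as a MegEngine module; forward() dispatches to
    # magicmind_runtime_opr, which applies the builtin MagicMindRuntime op.
    m = MagicMindRuntimeSubgraph(data)

    # The tests above feed NHWC float32 inputs, e.g. (1, 32, 32, 64).
    x = mge.tensor(np.random.randn(1, 32, 32, 64).astype(np.float32))
    outputs = m(x)  # one tensor per output of the loaded model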