/**
 * \file src/cambricon/test/magicmind_runtime_opr.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */

#include "megbrain/comp_node_env.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/io.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/test/helper.h"

#if MGB_CAMBRICON

#include "megbrain/cambricon/magicmind_runtime_opr.h"

#include "interface_builder.h"
#include "interface_network.h"

using namespace mgb;
using namespace opr;
using namespace magicmind;

namespace {

template <typename T>
void gen_rand_data(std::vector<T>& data, size_t num_elems, size_t scale) {
    unsigned int seed = time(0);
    data.resize(num_elems);
    for (size_t i = 0; i < num_elems; ++i) {
        data[i] = static_cast<T>(
                (rand_r(&seed) % (scale * 1000)) / 1000.0 - scale / 2.0);
    }
}

template <typename T>
void get_min_max(std::vector<T>& data, double& min, double& max) {
    min = *std::min_element(data.begin(), data.end());
    max = *std::max_element(data.begin(), data.end());
}

//! cast a float buffer to \p output_type; for int8/int16 targets a symmetric
//! quantization param is derived from the data range first
void cast_data_type(
        std::vector<float>& input, void* output, size_t size,
        cnrtDataType_t input_type, cnrtDataType_t output_type, double& min,
        double& max) {
    cnrtQuantizedParam_t param = NULL;
    if (output_type == CNRT_INT8 || output_type == CNRT_INT16) {
        get_min_max(input, min, max);
        int bitwidth = 8;
        if (output_type == CNRT_INT8) {
            bitwidth = 8;
        } else if (output_type == CNRT_INT16) {
            bitwidth = 16;
        }
        auto par_tmp = magicmind::RangeToUniformQuantParamWithQuantAlg(
                {min, max}, bitwidth, "symmetric");
        auto par = magicmind::UniformToNormalCast(par_tmp);
        MGB_CNRT_CHECK(cnrtCreateQuantizedParam(&param, par.pos, par.scale, 0));
    }
    MGB_CNRT_CHECK(cnrtCastDataType(
            reinterpret_cast<void*>(input.data()), input_type, output, output_type,
            size, param));
}

cnrtDataType_t convert_data_type(magicmind::DataType dtype) {
    static const std::unordered_map<magicmind::DataType, cnrtDataType_t> dtype_map = {
#define cb(dt_mm_, dt_cnrt_) {magicmind::DataType::dt_mm_, CNRT_##dt_cnrt_}
            cb(QINT8, INT8),      cb(QINT16, INT16),    cb(INT8, INT8),
            cb(INT16, INT16),     cb(INT32, INT32),     cb(UINT8, UINT8),
            cb(FLOAT16, FLOAT16), cb(FLOAT32, FLOAT32),
    };
    auto it = dtype_map.find(dtype);
    if (it != dtype_map.end())
        return it->second;
    else {
        mgb_assert(
                false, "unsupported magicmind dtype(%u).",
                static_cast<uint32_t>(dtype));
    }
}
///! taken from src/jit/impl/utils.cpp
void replace_all_pairs_inplace(
        std::string& text,
        const std::vector<std::pair<std::string, std::string>>& replace) {
    using str = std::string;
    auto repl_one = [&text](const str& from, const str& to) {
        mgb_assert(!from.empty());
        size_t pos = 0;
        while ((pos = text.find(from, pos)) != str::npos) {
            text.replace(pos, from.size(), to);
            pos += to.size();
        }
    };
    for (auto&& i : replace) {
        repl_one(i.first, i.second);
    }
}

class MMNetwork {
public:
    template <typename T>
    using MagicMindUniquePtr = magicmind_intl::MagicMindUniquePtr<T>;
    using IModelPtr = MagicMindRuntimeOpr::IModelPtr;
    using IContextPtr = MagicMindRuntimeOpr::IContextPtr;
    using IEnginePtr = MagicMindRuntimeOpr::IEnginePtr;

    const CompNode& cn_;
    magicmind::DataType op_datatype_;
    IModelPtr model_;
    bool graph_shape_mutable_;
    bool built_;

    template <typename T>
    static MagicMindUniquePtr<T> make_mm_unique_ptr(T* ptr) {
        return {ptr, magicmind_intl::MagicMindDeleter<T>()};
    }

    MMNetwork(
            const CompNode& cn, magicmind::DataType op_datatype,
            bool graph_shape_mutable = false)
            : cn_{cn},
              op_datatype_{op_datatype},
              model_{nullptr},
              graph_shape_mutable_{graph_shape_mutable},
              built_{false} {}

    void build() {
        auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env();
        cnrt_env.activate();
        constexpr int ni = 16, ci = 64, hi = 32, wi = 32;
        constexpr int no = 16, co = 64, ho = 32, wo = 32;
        constexpr int kh = 3, kw = 3;
        constexpr int stride_h = 1, stride_w = 1;
        constexpr int pad_h = 1, pad_w = 1;
        magicmind::Dims input_dim{{ni, ci, hi, wi}};
        magicmind::Dims filter_dim{{co, ci, kh, kw}};
        magicmind::Dims bias_dim{{co}};
        magicmind::Dims add_dim{{no, co, ho, wo}};
        magicmind::DataType output_datatype = magicmind::DataType::FLOAT32;
        // init
        auto builder = make_mm_unique_ptr(magicmind::CreateIBuilder());
        auto config = make_mm_unique_ptr(magicmind::CreateIBuilderConfig());
        std::string user_json_config = R"(
        {
            "graph_shape_mutable": {{GRAPH_SHAPE_MUTABLE}},
            "precision_config": {
                "precision_mode": "qint8_mixed_float16"
            }
        }
        )";
        replace_all_pairs_inplace(
                user_json_config,
                {{"{{GRAPH_SHAPE_MUTABLE}}", std::to_string(graph_shape_mutable_)}});
        config->ParseFromString(user_json_config);
        auto network = make_mm_unique_ptr(magicmind::CreateINetwork());
        magicmind::Range filter_range = {0.0f, 0.0f};
        // create input tensor
        auto init_tensor = [](magicmind::ITensor* tensor, const std::string& name,
                              const Dims& input_dim) {
            magicmind::Range input_range = {0.0f, 0.0f};
            std::vector<float> temp_buffer;
            gen_rand_data(temp_buffer, input_dim.GetElementCount(), 256);
            get_min_max(temp_buffer, input_range.min, input_range.max);
            MM_CHECK(tensor->SetDynamicRange(input_range, false));
            tensor->SetTensorName(name);
        };
        auto input_tensor = network->AddInput(op_datatype_, input_dim);
        init_tensor(input_tensor, "x", input_dim);
        auto add_tensor = network->AddInput(output_datatype, add_dim);
        init_tensor(add_tensor, "add", add_dim);
        // create filter tensor
        magicmind::ITensor* filter_tensor = nullptr;
        {
            std::vector<float> filter_buf;
            gen_rand_data(filter_buf, filter_dim.GetElementCount(), 1);
            std::vector<uint8_t> filter_buf_intx;
            filter_buf_intx.resize(
                    filter_dim.GetElementCount() *
                    magicmind::DataTypeSize(op_datatype_));
            cast_data_type(
                    filter_buf, reinterpret_cast<void*>(filter_buf_intx.data()),
                    filter_dim.GetElementCount(), CNRT_FLOAT32,
                    convert_data_type(op_datatype_), filter_range.min,
                    filter_range.max);
            auto filter = network->AddIConstNode(
                    op_datatype_, filter_dim,
                    reinterpret_cast<void*>(filter_buf_intx.data()));
            filter_tensor = filter->GetOutput(0);
            filter_tensor->SetDynamicRange(filter_range, false);
        }
        // create bias tensor
        magicmind::ITensor* bias_tensor = nullptr;
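        // the bias const node has to carry data in the conv output dtype: when the
        // output is FLOAT16 the float buffer is converted through cast_data_type()
        // first, otherwise the float data is handed to AddIConstNode directly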
        {
            std::vector<float> bias_buf;
            gen_rand_data(bias_buf, bias_dim.GetElementCount(), 1);
            std::vector<uint8_t> bias_buf_floatx;
            if (output_datatype == magicmind::DataType::FLOAT16) {
                bias_buf_floatx.resize(
                        bias_dim.GetElementCount() *
                        magicmind::DataTypeSize(output_datatype));
                double min = 0., max = 0.;
                cast_data_type(
                        bias_buf, reinterpret_cast<void*>(bias_buf_floatx.data()),
                        bias_dim.GetElementCount(), CNRT_FLOAT32,
                        convert_data_type(output_datatype), min, max);
                auto bias = network->AddIConstNode(
                        output_datatype, bias_dim,
                        reinterpret_cast<void*>(bias_buf_floatx.data()));
                bias_tensor = bias->GetOutput(0);
            } else {
                auto bias = network->AddIConstNode(
                        output_datatype, bias_dim,
                        reinterpret_cast<void*>(bias_buf.data()));
                bias_tensor = bias->GetOutput(0);
            }
        }
        // network topology:
        //  x   w   bias
        //   \  /    |
        //    |     /
        //   conv
        //    |
        //   relu ------ out1
        //     \        add
        //      \      /
        //         |
        //       out2
        // create conv + relu node
        auto conv = network->AddIConvNode(input_tensor, filter_tensor, bias_tensor);
        MM_CHECK(conv->SetStride(stride_h, stride_w));
        MM_CHECK(conv->SetPad(pad_h, pad_w, pad_h, pad_w));
        MM_CHECK(conv->SetDilation(1, 1));
        MM_CHECK(conv->SetPaddingMode(magicmind::IPaddingMode::EXPLICIT));
        auto conv_output = conv->GetOutput(0);
        // conv output tensor datatype should be set the same as the bias tensor
        MM_CHECK(conv->SetOutputType(0, output_datatype));
        // relu output tensor datatype will be the same as its input tensor
        auto relu =
                network->AddIActivationNode(conv_output, magicmind::IActivation::RELU);
        MM_CHECK(relu->SetOutputType(0, output_datatype));
        relu->GetOutput(0)->SetTensorName("out1");
        // set output nodes
        MM_CHECK(network->MarkOutput(relu->GetOutput(0)));
        // create elemwise add
        auto add = network->AddIElementwiseNode(
                relu->GetOutput(0), add_tensor, magicmind::IElementwise::ADD);
        add->GetOutput(0)->SetTensorName("out2");
        MM_CHECK(network->MarkOutput(add->GetOutput(0)));
        // create model
        model_ = {
                builder->BuildModel("model", network.get(), config.get()),
                magicmind_intl::MagicMindDeleter<magicmind::IModel>()};
        mgb_assert(model_ != nullptr);
        built_ = true;
    }

    const IModelPtr& get_inference_model() {
        if (!built_)
            build();
        return model_;
    }

    std::string get_serialized_model(bool serialize_to_file) {
        if (!built_)
            build();
        size_t size = 0;
        MM_CHECK(model_->GetSerializedModelSize(&size));
        std::string buf;
        buf.resize(size);
        MM_CHECK(model_->SerializeToMemory(reinterpret_cast<void*>(buf.data()), size));
        if (serialize_to_file) {
            std::string fname = ssprintf(
                    "./output/MagicMindRuntimeOprTest.%s.mlu",
                    graph_shape_mutable_ ? "GraphShapeMutable"
                                         : "GraphShapeImmutableBatch");
            model_->SerializeToFile(fname.c_str());
        }
        return buf;
    }

    //! run the model directly through the MagicMind engine/context API: IRT tensors
    //! are looked up by name ("x"/"add" inputs, "out1"/"out2" outputs), output shapes
    //! are inferred from the given input dims, and the model is enqueued `runs` times
    //! on the CNRT queue to report an average per-iteration latency
    void infer_model(
            const std::vector<void*>& inputs, const std::vector<void*>& outputs,
            const std::vector<magicmind::Dims>& input_dims) {
        if (!built_)
            build();
        auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env();
        cnrt_env.activate();
        auto engine = make_mm_unique_ptr(model_->CreateIEngine());
        mgb_assert(engine != nullptr);
        auto context = make_mm_unique_ptr(engine->CreateIContext());
        mgb_assert(context != nullptr);
        // create and get irttensor from context
        std::vector<magicmind::IRTTensor*> input_tensors;
        std::vector<magicmind::IRTTensor*> output_tensors;
        MM_CHECK(CreateInputTensors(context.get(), &input_tensors));
        MM_CHECK(CreateOutputTensors(context.get(), &output_tensors));
        MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetDimensions(input_dims[0]));
        MM_CHECK(FindIRTTensorByName(input_tensors, "add")
                         ->SetDimensions(input_dims[1]));
        MM_CHECK(context->InferOutputShape(input_tensors, output_tensors));
        MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetData(inputs[0]));
        MM_CHECK(FindIRTTensorByName(input_tensors, "add")->SetData(inputs[1]));
        MM_CHECK(FindIRTTensorByName(output_tensors, "out1")->SetData(outputs[0]));
        MM_CHECK(FindIRTTensorByName(output_tensors, "out2")->SetData(outputs[1]));
        auto&& queue = cnrt_env.queue;
        cnrtNotifier_t start, end;
        MGB_CNRT_CHECK(cnrtCreateNotifier(&start));
        MGB_CNRT_CHECK(cnrtCreateNotifier(&end));
        MGB_CNRT_CHECK(cnrtPlaceNotifier(start, queue));
        constexpr size_t runs = 50;
        for (size_t i = 0; i < runs; ++i) {
            MM_CHECK(context->Enqueue(input_tensors, output_tensors, queue));
        }
        MGB_CNRT_CHECK(cnrtPlaceNotifier(end, queue));
        MGB_CNRT_CHECK(cnrtSyncQueue(queue));
        float time = 0.f;
        MGB_CNRT_CHECK(cnrtNotifierDuration(start, end, &time));
        printf("inference time = %.2fms\n", time / static_cast<float>(runs) * 1e-3);
        MGB_CNRT_CHECK(cnrtDestroyNotifier(&start));
        MGB_CNRT_CHECK(cnrtDestroyNotifier(&end));
    }
};
}  // namespace

TEST(TestMagicMindRuntimeOpr, Basic) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, false);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
    // prepare parameter for addpad and conv
    const int ni = 16, ci = 64, hi = 32, wi = 32;
    const int no = 16, co = 64, ho = 32, wo = 32;
    // count tensor nums
    int conv_input_count = ni * hi * wi * ci;
    int relu_output_count = no * ho * wo * co;
    // prepare cpu origin data
    std::vector<float> conv_input_cpu_data;
    gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
    std::vector<float> add_input_cpu_data;
    gen_rand_data(add_input_cpu_data, relu_output_count, 256);
    std::vector<float> relu_output_cpu_data(relu_output_count);
    std::vector<float> add_output_cpu_data(relu_output_count);
    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
    void* conv_input_mlu_ptr;
    void* add_input_mlu_ptr;
    void* relu_output_mlu_ptr;
    void* add_output_mlu_ptr;
    // malloc mlu mem for fusion input and output
    MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
    MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
    // memory copy cpu->mlu
    MGB_CNRT_CHECK(cnrtMemcpy(
            conv_input_mlu_ptr, conv_input_cpu_data.data(),
            conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_input_mlu_ptr, add_input_cpu_data.data(),
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
    std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
            conv_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
            add_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
            relu_output_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
            {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
    // result memory copy mlu->cpu
    MGB_CNRT_CHECK(cnrtMemcpy(
            relu_output_cpu_data.data(), relu_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_output_cpu_data.data(), add_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->ptr<float>()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(float));
    std::memcpy(
            reinterpret_cast<void*>(add->ptr<float>()), add_input_cpu_data.data(),
            relu_output_count * sizeof(float));
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    std::memcpy(
            o2_mm.ptr<float>(), add_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
    MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
}

TEST(TestMagicMindRuntimeOpr, InputQInt8) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::QINT8, false);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::QINT8);
    // prepare parameter for addpad and conv
    const int ni = 16, ci = 64, hi = 32, wi = 32;
    const int no = 16, co = 64, ho = 32, wo = 32;
    // count tensor nums
    int conv_input_count = ni * hi * wi * ci;
    int relu_output_count = no * ho * wo * co;
    // prepare cpu origin data
    std::vector<int8_t> conv_input_cpu_data;
    gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
    std::vector<float> add_input_cpu_data;
    gen_rand_data(add_input_cpu_data, relu_output_count, 256);
    std::vector<float> relu_output_cpu_data(relu_output_count);
    std::vector<float> add_output_cpu_data(relu_output_count);
    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
    void* conv_input_mlu_ptr;
    void* add_input_mlu_ptr;
    void* relu_output_mlu_ptr;
    void* add_output_mlu_ptr;
    // malloc mlu mem for fusion input and output
    MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
    MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
    // memory copy cpu->mlu
    MGB_CNRT_CHECK(cnrtMemcpy(
            conv_input_mlu_ptr, conv_input_cpu_data.data(),
            conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_input_mlu_ptr, add_input_cpu_data.data(),
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
    std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
            conv_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
            add_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
            relu_output_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
            {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
    // result memory copy mlu->cpu
    MGB_CNRT_CHECK(cnrtMemcpy(
            relu_output_cpu_data.data(), relu_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_output_cpu_data.data(), add_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{ni, ci, hi, wi}, dtype::QuantizedS8{1.f}});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->raw_ptr()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(int8_t));
    std::memcpy(
            reinterpret_cast<void*>(add->ptr<float>()), add_input_cpu_data.data(),
            relu_output_count * sizeof(float));
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    std::memcpy(
            o2_mm.ptr<float>(), add_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
    MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
}

TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
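    // the model is built with graph_shape_mutable=true, so the single serialized
    // model produced below is expected to serve the differently shaped inputs
    // exercised by the check() calls at the end of this test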
    auto check = [&](magicmind::Dims input_dim, magicmind::Dims output_dim) {
        // prepare parameter for addpad and conv
        const int ni = input_dim[0], ci = input_dim[1], hi = input_dim[2],
                  wi = input_dim[3];
        const int no = output_dim[0], co = output_dim[1], ho = output_dim[2],
                  wo = output_dim[3];
        // count tensor nums
        int conv_input_count = ni * hi * wi * ci;
        int relu_output_count = no * ho * wo * co;
        // prepare cpu origin data
        std::vector<float> conv_input_cpu_data;
        gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
        std::vector<float> add_input_cpu_data;
        gen_rand_data(add_input_cpu_data, relu_output_count, 256);
        std::vector<float> relu_output_cpu_data(relu_output_count);
        std::vector<float> add_output_cpu_data(relu_output_count);
        auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
        void* conv_input_mlu_ptr;
        void* add_input_mlu_ptr;
        void* relu_output_mlu_ptr;
        void* add_output_mlu_ptr;
        // malloc mlu mem for fusion input and output
        MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
        MGB_CNRT_CHECK(
                cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
        MGB_CNRT_CHECK(
                cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
        MGB_CNRT_CHECK(
                cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
        // memory copy cpu->mlu
        MGB_CNRT_CHECK(cnrtMemcpy(
                conv_input_mlu_ptr, conv_input_cpu_data.data(),
                conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
        MGB_CNRT_CHECK(cnrtMemcpy(
                add_input_mlu_ptr, add_input_cpu_data.data(),
                relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
        std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
                conv_input_mlu_ptr, mlu_deleter};
        std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
                add_input_mlu_ptr, mlu_deleter};
        std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
                relu_output_mlu_ptr, mlu_deleter};
        std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
                add_output_mlu_ptr, mlu_deleter};
        network.infer_model(
                {conv_input_mlu_ptr, add_input_mlu_ptr},
                {relu_output_mlu_ptr, add_output_mlu_ptr},
                {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
        // result memory copy mlu->cpu
        MGB_CNRT_CHECK(cnrtMemcpy(
                relu_output_cpu_data.data(), relu_output_mlu_ptr,
                relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
        MGB_CNRT_CHECK(cnrtMemcpy(
                add_output_cpu_data.data(), add_output_mlu_ptr,
                relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
        auto buf = network.get_serialized_model(true);
        auto mkshp = [](int n, int c, int h, int w) {
            size_t nz = n, cz = c, hz = h, wz = w;
            return TensorShape{nz, cz, hz, wz};
        };
        auto mkly = [](int n, int c, int h, int w, DType dtype) {
            size_t nz = n, cz = c, hz = h, wz = w;
            return TensorLayout{{nz, cz, hz, wz}, dtype};
        };
        auto x = std::make_shared<HostTensorND>(
                cn, mkly(ni, ci, hi, wi, dtype::Float32()));
        auto add = std::make_shared<HostTensorND>(
                cn, mkly(no, co, ho, wo, dtype::Float32()));
        std::memcpy(
                reinterpret_cast<void*>(x->ptr<float>()), conv_input_cpu_data.data(),
                conv_input_count * sizeof(float));
        std::memcpy(
                reinterpret_cast<void*>(add->ptr<float>()), add_input_cpu_data.data(),
                relu_output_count * sizeof(float));
        auto graph = ComputingGraph::make();
        auto x_ = opr::Host2DeviceCopy::make(*graph, x);
        auto add_ = opr::Host2DeviceCopy::make(*graph, add);
        auto outs = opr::MagicMindRuntimeOpr::make(
                reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
        auto out1 = outs[0];
        auto out2 = outs[1];
        HostTensorND o1(cn, mkshp(no, co, ho, wo), dtype::Float32());
        HostTensorND o2(cn, mkshp(no, co, ho, wo), dtype::Float32());
        auto func = graph->compile(
                {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
        func->execute();
        HostTensorND o1_mm(cn, mkshp(no, co, ho, wo), dtype::Float32()),
                o2_mm(cn, mkshp(no, co, ho, wo), dtype::Float32());
        std::memcpy(
                o1_mm.ptr<float>(), relu_output_cpu_data.data(),
                relu_output_count * sizeof(float));
        std::memcpy(
                o2_mm.ptr<float>(), add_output_cpu_data.data(),
                relu_output_count * sizeof(float));
        MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
        MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
    };
    check(Dims{{1, 64, 32, 32}}, Dims{{1, 64, 32, 32}});
    check(Dims{{32, 64, 32, 32}}, Dims{{32, 64, 32, 32}});
    check(Dims{{7, 64, 16, 16}}, Dims{{7, 64, 16, 16}});
}

TEST(TestMagicMindRuntimeOpr, Serialization) {
    using namespace serialization;
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    auto buf = network.get_serialized_model(false);
    // prepare parameter for addpad and conv
    const int ni = 1, ci = 64, hi = 32, wi = 32;
    const int no = 1, co = 64, ho = 32, wo = 32;
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    auto fname = output_file("MagicMindRuntimeOprTest");
    auto dump = [&]() {
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto rst = dumper->dump({out1, out2});
        ASSERT_EQ(rst.outputs.size(), 2u);
    };
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        ASSERT_EQ(rst.output_var_list.size(), 2u);
    };
    dump();
    load();
}

TEST(TestMagicMindRuntimeOpr, Profiling) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    auto buf = network.get_serialized_model(false);
    const int ni = 8, ci = 64, hi = 32, wi = 32;
    const int no = 1, co = 64, ho = 32, wo = 32;
    HostTensorGenerator<> gen(0, 1);
    auto x = gen({ni, ci, hi, wi}, cn);
    auto add = gen({no, co, ho, wo}, cn);
    auto graph = ComputingGraph::make();
    GraphProfiler profiler{graph.get()};
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    graph->options().var_sanity_check_first_run = false;
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    profiler.to_json_full(func.get())
            ->writeto_fpath(output_file("magicmind_runtime_opr_profile.json"));
}
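//! same computation as the Basic test, but the host inputs live on cpu0 and are
//! moved onto the cambricon comp node with opr::Copy before being fed to the
//! MagicMind runtime opr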
TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, false);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
    // prepare parameter for addpad and conv
    const int ni = 16, ci = 64, hi = 32, wi = 32;
    const int no = 16, co = 64, ho = 32, wo = 32;
    // count tensor nums
    int conv_input_count = ni * hi * wi * ci;
    int relu_output_count = no * ho * wo * co;
    // prepare cpu origin data
    std::vector<float> conv_input_cpu_data;
    gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
    std::vector<float> add_input_cpu_data;
    gen_rand_data(add_input_cpu_data, relu_output_count, 256);
    std::vector<float> relu_output_cpu_data(relu_output_count);
    std::vector<float> add_output_cpu_data(relu_output_count);
    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
    void* conv_input_mlu_ptr;
    void* add_input_mlu_ptr;
    void* relu_output_mlu_ptr;
    void* add_output_mlu_ptr;
    // malloc mlu mem for fusion input and output
    MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
    MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
    // memory copy cpu->mlu
    MGB_CNRT_CHECK(cnrtMemcpy(
            conv_input_mlu_ptr, conv_input_cpu_data.data(),
            conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_input_mlu_ptr, add_input_cpu_data.data(),
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
    std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
            conv_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
            add_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
            relu_output_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
            {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
    // result memory copy mlu->cpu
    MGB_CNRT_CHECK(cnrtMemcpy(
            relu_output_cpu_data.data(), relu_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_output_cpu_data.data(), add_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    auto cn_cpu = CompNode::load("cpu0");
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
            cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn_cpu, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->ptr<float>()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(float));
    std::memcpy(
            reinterpret_cast<void*>(add->ptr<float>()), add_input_cpu_data.data(),
            relu_output_count * sizeof(float));
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x, {cn_cpu});
    auto add_ = opr::Host2DeviceCopy::make(*graph, add, {cn_cpu});
    x_ = opr::Copy::make(x_, {cn});
    add_ = opr::Copy::make(add_, {cn});
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    std::memcpy(
            o2_mm.ptr<float>(), add_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
    MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
}

#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}