From a5609f3bbfc64c18b04e2eeec3d6484d08f63ac9 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Thu, 3 Sep 2020 12:27:47 +0800 Subject: [PATCH] fix(cambricon): fix cross cn copy for cambricon GitOrigin-RevId: 21942a82a36df7017e6fc72a4a0f4c3419c77488 --- src/cambricon/test/cambricon_runtime_opr.cpp | 87 ++++++++++++++++++++ src/core/impl/comp_node/cpu/comp_node.cpp | 31 +++++-- src/core/impl/tensor.cpp | 8 +- 3 files changed, 118 insertions(+), 8 deletions(-) diff --git a/src/cambricon/test/cambricon_runtime_opr.cpp b/src/cambricon/test/cambricon_runtime_opr.cpp index e33e321f5..b3beb7350 100644 --- a/src/cambricon/test/cambricon_runtime_opr.cpp +++ b/src/cambricon/test/cambricon_runtime_opr.cpp @@ -11,6 +11,7 @@ #include "megbrain/comp_node_env.h" #include "megbrain/opr/io.h" +#include "megbrain/opr/basic_arith.h" #include "megbrain/plugin/profiler.h" #include "megbrain/serialization/serializer.h" #include "megbrain/test/helper.h" @@ -557,6 +558,92 @@ TEST(TestCambriconRuntimeOpr, Profiling) { profiler.to_json_full(func.get()) ->writeto_fpath(output_file("cambricon_runtime_opr_profile.json")); } + +TEST(TestCambriconRuntimeOpr, CrossCNCopy) { + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + CnmlModelContext ctx{cn, true}; + + // prepare parameter for addpad and conv + size_t ni = 16, ci = 64, hi = 32, wi = 32; + size_t no = 16, co = 64, ho = 32, wo = 32; + + // count tensor nums + int conv_input_count = ni * hi * wi * ci; + int relu_output_count = no * ho * wo * co; + + // prepare cpu origin data + std::vector<float> conv_input_cpu_data(conv_input_count); + std::vector<float> relu_output_cpu_data(relu_output_count); + + // prepare input data for addpad + unsigned int seed = time(0); + for (int index = 0; index < conv_input_count; ++index) { + conv_input_cpu_data[index] = ((rand_r(&seed) % 100 / 100.0) - 0.5) / 2; + } + + // prepare cpu data to converts to mlu memory + std::vector<int16_t> conv_input_cpu(conv_input_count); + std::vector<int16_t>
relu_output_cpu(relu_output_count); + MGB_CNRT_CHECK(cnrtCastDataType(conv_input_cpu_data.data(), CNRT_FLOAT32, + conv_input_cpu.data(), CNRT_FLOAT16, + conv_input_count, nullptr)); + + auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; + void* input_mlu_ptr; + void* output_mlu_ptr; + + // malloc mlu mem for fusion input and output + MGB_CNRT_CHECK( + cnrtMalloc(&input_mlu_ptr, conv_input_count * sizeof(int16_t))); + MGB_CNRT_CHECK( + cnrtMalloc(&output_mlu_ptr, relu_output_count * sizeof(int16_t))); + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy(input_mlu_ptr, conv_input_cpu.data(), + conv_input_count * sizeof(int16_t), + CNRT_MEM_TRANS_DIR_HOST2DEV)); + std::unique_ptr<void, decltype(mlu_deleter)> input_holder{input_mlu_ptr, + mlu_deleter}; + std::unique_ptr<void, decltype(mlu_deleter)> output_holder{output_mlu_ptr, + mlu_deleter}; + + ctx.do_inference(&input_mlu_ptr, &output_mlu_ptr); + + // result memory copy cnml->cpu + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy(relu_output_cpu.data(), output_mlu_ptr, + relu_output_count * sizeof(int16_t), + CNRT_MEM_TRANS_DIR_DEV2HOST)); + MGB_CNRT_CHECK(cnrtCastDataType(relu_output_cpu.data(), CNRT_FLOAT16, + relu_output_cpu_data.data(), CNRT_FLOAT32, + relu_output_count, nullptr)); + auto cn_cpu = CompNode::load("cpu0"); + // cnml inference finished + auto buf = ctx.get_serialized_model(); + std::shared_ptr<HostTensorND> input = std::make_shared<HostTensorND>( + cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float16()}); + memcpy(reinterpret_cast<void*>(input->ptr<dt_float16>()), + conv_input_cpu.data(), conv_input_count * sizeof(int16_t)); + auto graph = ComputingGraph::make(); + auto host_x = opr::Host2DeviceCopy::make(*graph, input, {cn_cpu}); + auto x = opr::Copy::make(host_x, {cn}); + auto y = opr::CambriconRuntimeOpr::make(buf.data(), buf.size(), "subnet0", + {x}, true)[0]; + HostTensorND output(CompNode::default_cpu(), {no, co, ho, wo}, + dtype::Float16()); + auto func = graph->compile({make_callback_copy(y, output)}); + func->execute(); + HostTensorND out_cnml(cn_cpu, {no, co, ho, wo},
dtype::Float32()), + out_mgb(cn_cpu, {no, co, ho, wo}, dtype::Float32()); + memcpy(out_cnml.ptr<float>(), relu_output_cpu_data.data(), + relu_output_count * sizeof(float)); + MGB_CNRT_CHECK( + cnrtCastDataType(reinterpret_cast<void*>(output.ptr<dt_float16>()), + CNRT_FLOAT16, out_mgb.ptr<float>(), CNRT_FLOAT32, + relu_output_count, nullptr)); + MGB_ASSERT_TENSOR_NEAR(out_cnml, out_mgb, 1e-4); +} + #endif // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} diff --git a/src/core/impl/comp_node/cpu/comp_node.cpp b/src/core/impl/comp_node/cpu/comp_node.cpp index 227557ff6..38e4b4de2 100644 --- a/src/core/impl/comp_node/cpu/comp_node.cpp +++ b/src/core/impl/comp_node/cpu/comp_node.cpp @@ -397,7 +397,16 @@ class CpuCompNode::CompNodeImpl final: public CpuDispatchableBase { "Atlas comp_node used but " "MGB_ATLAS not enabled"); #endif - + } else if (dest_impl->env().property().type == + DeviceType::CAMBRICON) { +#if MGB_CAMBRICON + dest_impl->copy_to_device(dest, src, size); + return; +#else + mgb_throw(MegBrainError, + "Cambricon comp_node used but " + "MGB_CAMBRICON not enabled"); +#endif } else { mgb_assert(locator().device == Locator::DEVICE_CPU_DEFAULT, @@ -912,12 +921,13 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by( { auto type = cn_impl->env().property().type; - mgb_throw_if(type != CompNode::DeviceType::CPU - && type != CompNode::DeviceType::CUDA - && type != CompNode::DeviceType::ATLAS - , - MegBrainError, - "currently CPU can only wait for CPU, CUDA, ATLAS" + mgb_throw_if( + type != CompNode::DeviceType::CPU && + type != CompNode::DeviceType::CUDA + && type != CompNode::DeviceType::ATLAS && + type != CompNode::DeviceType::CAMBRICON, + MegBrainError, + "currently CPU can only wait for CPU, CUDA, ATLAS, CAMBRICON" ); } @@ -928,6 +938,13 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by( mgb_throw(MegBrainError, "Atlas comp_node used but MGB_ATLAS not enabled"); #endif + } else if (cn_impl->env().property().type ==
CompNode::DeviceType::CAMBRICON) { +#if MGB_CAMBRICON + return m_comp_node_impl->sync(); +#else + mgb_throw(MegBrainError, + "Cambricon comp_node used but MGB_CAMBRICON not enabled"); +#endif } diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp index e6c94322f..b2d1b148f 100644 --- a/src/core/impl/tensor.cpp +++ b/src/core/impl/tensor.cpp @@ -677,7 +677,13 @@ void mgb::dev_tensor_memset(const DeviceTensorND& tensor, int val) { #endif break; #endif - case CompNode::DeviceType::CPU: { +#if MGB_CAMBRICON + case CompNode::DeviceType::CAMBRICON: + MGB_CNRT_CHECK(cnrtSyncQueue(env.cnrt_env().queue)); + MGB_CNRT_CHECK(cnrtMemset(ptr, val, size)); + break; +#endif + case CompNode::DeviceType::CPU: { auto fill = [ptr, size, val]() { std::memset(ptr, val, size); }; env.cpu_env().dispatch(fill); } break; -- GitLab