diff --git a/src/core/impl/comp_node/cpu/comp_node.cpp b/src/core/impl/comp_node/cpu/comp_node.cpp
index 6ec87c6cbdc516a9f7c37209ea285c5bb2374f0d..3beab27c9bde432cd54f648b30c13396ff619cc9 100644
--- a/src/core/impl/comp_node/cpu/comp_node.cpp
+++ b/src/core/impl/comp_node/cpu/comp_node.cpp
@@ -306,11 +306,37 @@ public:
         m_env.cpu_env().dispatch(do_copy);
     }
 
+    void copy_to_host_ref(
+            megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
+            size_t size) override {
+        // use lambda capture to avoid memory allocation in std::bind
+        auto do_copy = [host_ref_ptr, device_ref_ptr, size]() {
+            std::memcpy(host_ref_ptr.get_ptr(), device_ref_ptr.get_ptr(), size);
+        };
+        m_env.cpu_env().dispatch(do_copy);
+    }
+
+    void copy_to_device_ref(
+            megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
+            size_t size) override {
+        // use lambda capture to avoid memory allocation in std::bind
+        auto do_copy = [device_ref_ptr, host_ref_ptr, size]() {
+            std::memcpy(device_ref_ptr.get_ptr(), host_ref_ptr.get_ptr(), size);
+        };
+        m_env.cpu_env().dispatch(do_copy);
+    }
+
     void peer_copy_to(
             Impl* dest_impl, void* dest, const void* src, size_t size) override {
         dest_impl->copy_to_device(dest, src, size);
     }
 
+    void peer_copy_to_ref(
+            Impl* dest_impl, megdnn::RefPtr& dest, megdnn::RefPtr& src,
+            size_t size) override {
+        dest_impl->copy_to_device_ref(dest, src, size);
+    }
+
     size_t get_mem_addr_alignment() override { return m_env.property().mem_alignment; }
 
     void dispatch(Task&& task) override { m_env.cpu_env().dispatch(std::move(task)); }
@@ -733,6 +759,24 @@ public:
         CompNodeBaseImpl::copy_to_device(device_ptr, host_ptr, size);
     }
 
+    void copy_to_host_ref(
+            megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
+            size_t size) override {
+        if (m_worker_queue) {
+            m_worker_queue->check_exception();
+        }
+        CompNodeBaseImpl::copy_to_host_ref(host_ref_ptr, device_ref_ptr, size);
+    }
+
+    void copy_to_device_ref(
+            megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
+            size_t size) override {
+        if (m_worker_queue) {
+            m_worker_queue->check_exception();
+        }
+        CompNodeBaseImpl::copy_to_device_ref(device_ref_ptr, host_ref_ptr, size);
+    }
+
     void peer_copy_to(
             Impl* dest_impl, void* dest, const void* src, size_t size) override {
         //! copy to default_cpu
@@ -774,6 +818,48 @@ public:
         dest_impl->copy_to_device(dest, src, size);
     }
 
+    void peer_copy_to_ref(
+            Impl* dest_impl, megdnn::RefPtr& dest, megdnn::RefPtr& src,
+            size_t size) override {
+        //! copy to default_cpu
+        if (dest_impl->same_type<CpuCompNode::CompNodeImpl>()) {
+            CompNodeBaseImpl::peer_copy_to_ref(dest_impl, dest, src, size);
+            return;
+        }
+
+        if (!dest_impl->same_type<CpuCompNode::CompNodeImpl>()) {
+            if (dest_impl->env().property().type == DeviceType::ATLAS) {
+#if MGB_ATLAS
+                dest_impl->copy_to_device(dest.get_ptr(), src.get_ptr(), size);
+                return;
+#else
+                mgb_throw(
+                        MegBrainError,
+                        "Atlas comp_node used but "
+                        "ATLAS BUILD not enabled");
+#endif
+            } else if (dest_impl->env().property().type == DeviceType::CAMBRICON) {
+#if MGB_CAMBRICON
+                dest_impl->copy_to_device(dest.get_ptr(), src.get_ptr(), size);
+                return;
+#else
+                mgb_throw(
+                        MegBrainError,
+                        "Cambricon comp_node used but "
+                        "CAMBRICON BUILD not enabled");
+#endif
+            }
+            else {
+                mgb_assert(
+                        locator().device == Locator::DEVICE_CPU_DEFAULT,
+                        "currently only peer copy from default cpu comp "
+                        "nodes "
+                        "is implemented");
+            }
+        }
+        dest_impl->copy_to_device_ref(dest, src, size);
+    }
+
     std::unique_ptr<Event> create_event(size_t flags) override {
         if (m_worker_queue) {
             m_worker_queue->check_exception();
diff --git a/src/core/impl/graph/var_node_mem_mgr.cpp b/src/core/impl/graph/var_node_mem_mgr.cpp
index 2af9e1e6c9299457a32a768294241a0b4dec760b..96a2707256c24005932756814f6d09f99ee6c02a 100644
--- a/src/core/impl/graph/var_node_mem_mgr.cpp
+++ b/src/core/impl/graph/var_node_mem_mgr.cpp
@@ -81,9 +81,8 @@ const DeviceTensorStorage& StaticDeviceMemoryManager::alloc(
 void StaticDeviceMemoryManager::prefault() {
     for (auto&& i : m_storage) {
         if (i.first.device_type() == CompNode::DeviceType::CPU) {
-            auto set = [ptr = i.second.ptr(), size = i.second.size()]() {
-                memset(ptr, 0, size);
-            };
+            auto storage = i.second;
+            auto set = [storage]() { memset(storage.ptr(), 0, storage.size()); };
             CompNodeEnv::from_comp_node(i.first).cpu_env().dispatch(set);
             i.first.sync();
         }
diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp
index a58bb84e7e83623d4798c74a5bba5d89923c80f3..0939ee05b0af44ecab47c996a49e72983e6305d9 100644
--- a/src/core/impl/tensor.cpp
+++ b/src/core/impl/tensor.cpp
@@ -379,7 +379,9 @@ MGE_WIN_DECLSPEC_FUC void TensorStorage<HostTensorStorageTrait>::copy_from(
             need_sync = true;
         }
     }
-    src.comp_node().copy_to_host(ptr(), src.ptr(), size);
+    megdnn::RefPtr src_ptr(src.get_ref_ptr(), src.offset(), false);
+    megdnn::RefPtr dst_ptr(get_ref_ptr(), offset(), false);
+    src.comp_node().copy_to_host_ref(dst_ptr, src_ptr, size);
     if (need_sync)
         src.comp_node().sync();
 }
@@ -390,7 +392,9 @@ template <>
 MGE_WIN_DECLSPEC_FUC void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         const TensorStorage<HostTensorStorageTrait>& src, size_t size) const {
     mgb_assert(size <= this->size() && size <= src.size());
-    m_comp_node.copy_to_device(ptr(), src.ptr(), size);
+    megdnn::RefPtr src_ptr(src.get_ref_ptr(), src.offset(), false);
+    megdnn::RefPtr dst_ptr(get_ref_ptr(), offset(), false);
+    m_comp_node.copy_to_device_ref(dst_ptr, src_ptr, size);
 }
 
 // device to device
@@ -417,9 +421,13 @@ MGE_WIN_DECLSPEC_FUC void TensorStorage<DeviceTensorStorageTrait>::copy_from(
         // to pin the memory of src tensor, so it does not require synchronization
         // and is more efficient
         src.comp_node().sync();
-        comp_node().copy_to_device(ptr(), src.ptr(), size);
+        megdnn::RefPtr src_ptr(src.get_ref_ptr(), src.offset(), false);
+        megdnn::RefPtr dst_ptr(get_ref_ptr(), offset(), false);
+        comp_node().copy_to_device_ref(dst_ptr, src_ptr, size);
     } else {
-        src.comp_node().peer_copy_to(m_comp_node, ptr(), src.ptr(), size);
+        megdnn::RefPtr src_ptr(src.get_ref_ptr(), src.offset(), false);
+        megdnn::RefPtr dst_ptr(get_ref_ptr(), offset(), false);
+        src.comp_node().peer_copy_to_ref(m_comp_node, dst_ptr, src_ptr, size);
     }
 }
 
@@ -712,32 +720,34 @@ const typename TensorND<TensorStorage>::ChainReturnType& TensorND<
 void mgb::dev_tensor_memset(const DeviceTensorND& tensor, int val) {
     auto&& env = CompNodeEnv::from_comp_node(tensor.comp_node());
     env.activate();
-    void* ptr = tensor.raw_ptr();
     size_t size = tensor.layout().span().dist_byte();
     switch (env.property().type) {
 #if MGB_CUDA
         case CompNode::DeviceType::CUDA:
-            MGB_CUDA_CHECK(cudaMemsetAsync(ptr, val, size, env.cuda_env().stream));
+            MGB_CUDA_CHECK(cudaMemsetAsync(
+                    tensor.raw_ptr(), val, size, env.cuda_env().stream));
             break;
 #endif
 #if MGB_ATLAS
         case CompNode::DeviceType::ATLAS:
 #if MGB_USE_ATLAS_ASYNC_API
-            MGB_ATLAS_CHECK(
-                    aclrtMemsetAsync(ptr, -1, val, size, env.atlas_env().stream));
+            MGB_ATLAS_CHECK(aclrtMemsetAsync(
+                    tensor.raw_ptr(), -1, val, size, env.atlas_env().stream));
 #else
-            MGB_ATLAS_CHECK(aclrtMemset(ptr, -1, val, size));
+            MGB_ATLAS_CHECK(aclrtMemset(tensor.raw_ptr(), -1, val, size));
 #endif
             break;
 #endif
 #if MGB_CAMBRICON
         case CompNode::DeviceType::CAMBRICON:
             MGB_CNRT_CHECK(cnrtSyncQueue(env.cnrt_env().queue));
-            MGB_CNRT_CHECK(cnrtMemset(ptr, val, size));
+            MGB_CNRT_CHECK(cnrtMemset(tensor.raw_ptr(), val, size));
             break;
 #endif
         case CompNode::DeviceType::CPU: {
-            auto fill = [ptr, size, val]() { std::memset(ptr, val, size); };
+            auto fill = [tensor, size, val]() {
+                std::memset(tensor.as_megdnn().raw_ptr(), val, size);
+            };
             env.cpu_env().dispatch(fill);
         } break;
         default:
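For context on why these hunks capture `megdnn::RefPtr` objects (or whole storage objects, as in `prefault()` and `dev_tensor_memset`) by value instead of raw pointers: the CPU dispatcher runs the lambda asynchronously, so the address must be resolved when the task executes, not when it is queued. The sketch below is a simplified model of that idea, assuming a shared-slot design; `RefPtrSketch` and `dispatch_copy` are illustrative names, not the real `megdnn::RefPtr` interface.

```cpp
// A simplified model of the RefPtr idea (illustrative only): the dispatched
// lambda copies the ref by value, so the storage's current address is read
// when the task runs rather than frozen at dispatch time.
#include <cstddef>
#include <cstring>
#include <memory>

class RefPtrSketch {
    std::shared_ptr<void*> m_ref;  // shared slot holding the current address
    size_t m_offset;

public:
    RefPtrSketch(std::shared_ptr<void*> ref, size_t offset)
            : m_ref{std::move(ref)}, m_offset{offset} {}

    // resolve lazily: reflects any storage reset done after construction
    void* get_ptr() const { return static_cast<char*>(*m_ref) + m_offset; }
};

// mirrors the shape of copy_to_device_ref above: capture by value, copy later
void dispatch_copy(RefPtrSketch dst, RefPtrSketch src, size_t size) {
    auto do_copy = [dst, src, size]() {
        std::memcpy(dst.get_ptr(), src.get_ptr(), size);
    };
    do_copy();  // a real comp node would enqueue this on its dispatch queue
}
```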
diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index dfd52193bc3289749838b6dc213f41f415e04426..56a281f7c6d58929afd433a5538f113cc6871435 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -242,6 +242,20 @@ public:
         return m_impl->copy_to_device(device_ptr, host_ptr, size);
     }
 
+    //! copy from underlying device to host
+    void copy_to_host_ref(
+            megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
+            size_t size) const {
+        return m_impl->copy_to_host_ref(host_ref_ptr, device_ref_ptr, size);
+    }
+
+    //! copy from host to underlying device
+    void copy_to_device_ref(
+            megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
+            size_t size) const {
+        return m_impl->copy_to_device_ref(device_ref_ptr, host_ref_ptr, size);
+    }
+
     /*!
      * \brief copy from this device to another device; would use the
      * computing resource on dest_node
@@ -253,6 +267,14 @@ public:
                 reinterpret_cast<Impl*>(dest_node.m_impl), dest, src, size);
     }
 
+    void peer_copy_to_ref(
+            CompNode dest_node, megdnn::RefPtr& dst_ref_ptr,
+            megdnn::RefPtr& src_ref_ptr, size_t size) const {
+        return m_impl->peer_copy_to_ref(
+                reinterpret_cast<Impl*>(dest_node.m_impl), dst_ref_ptr, src_ref_ptr,
+                size);
+    }
+
     //! get alignment requiement in bytes; guaranteed to be power of 2
     size_t get_mem_addr_alignment() const {
         return m_impl->get_mem_addr_alignment();
@@ -517,9 +539,25 @@ protected:
             void* host_ptr, const void* device_ptr, size_t size) = 0;
     virtual void copy_to_device(
             void* device_ptr, const void* host_ptr, size_t size) = 0;
+    virtual void copy_to_host_ref(
+            megdnn::RefPtr& host_ref_ptr, megdnn::RefPtr& device_ref_ptr,
+            size_t size) {
+        copy_to_host(host_ref_ptr.get_ptr(), device_ref_ptr.get_ptr(), size);
+    }
+    virtual void copy_to_device_ref(
+            megdnn::RefPtr& device_ref_ptr, megdnn::RefPtr& host_ref_ptr,
+            size_t size) {
+        copy_to_device(device_ref_ptr.get_ptr(), host_ref_ptr.get_ptr(), size);
+    }
     virtual void peer_copy_to(
             Impl* dest_impl, void* dest, const void* src, size_t size) = 0;
 
+    virtual void peer_copy_to_ref(
+            Impl* dest_impl, megdnn::RefPtr& dest, megdnn::RefPtr& src,
+            size_t size) {
+        peer_copy_to(dest_impl, dest.get_ptr(), src.get_ptr(), size);
+    }
+
     virtual size_t get_mem_addr_alignment() = 0;
     virtual size_t get_mem_padding();
 
diff --git a/src/gopt/test/network.cpp b/src/gopt/test/network.cpp
index 73b3e86871fc1bf4de44a4c11aa5b08f0809bd7a..7c264ecc7eadd7bb38703f827a89394fee96d190 100644
--- a/src/gopt/test/network.cpp
+++ b/src/gopt/test/network.cpp
@@ -100,6 +100,10 @@ SymbolVar Network::add_type_cvt(SymbolVar f, DType out_dtype) {
     return opr::TypeCvt::make(f, out_dtype);
 }
 
+SymbolVar Network::add_concat(SymbolVar f, SymbolVar g, int axis) {
+    return opr::Concat::make({f, g}, axis);
+}
+
 SymbolVar mgb::create_block(
         Network& network, SymbolVar f_in, size_t stride, size_t num_outputs1,
         bool has_proj, DType out_dtype) {
diff --git a/src/gopt/test/network.h b/src/gopt/test/network.h
index 011d30346c7672fd54d7d5436220fd98bcfdbb4f..a0018e57a848a3747dadee19e399817588c85798 100644
--- a/src/gopt/test/network.h
+++ b/src/gopt/test/network.h
@@ -60,6 +60,7 @@ public:
             Padding padding = {0, 0},
             opr::Pooling::Param::Mode mode = opr::Pooling::Param::Mode::MAX);
     SymbolVar add_type_cvt(SymbolVar f, DType out_dtype = dtype::Float32());
+    SymbolVar add_concat(SymbolVar f, SymbolVar g, int axis = 0);
 };
 
 SymbolVar create_block(
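The comp_node.h hunks keep every existing backend working by giving the new `*_ref` virtuals default bodies that eagerly resolve the ref and forward to the old raw-pointer virtuals; only the CPU dispatcher, which needs deferred address resolution, overrides them. A minimal sketch of that fallback pattern, under hypothetical names (not the MegBrain class hierarchy):

```cpp
// The *_ref virtual defaults to the eager raw-pointer path, so backends
// that ignore RefPtr semantics inherit the old behavior unchanged.
#include <cstddef>
#include <cstdio>

struct RefLike {
    void* p;
    void* get_ptr() const { return p; }
};

struct ImplBase {
    virtual ~ImplBase() = default;
    virtual void copy_to_device(void* dst, const void* src, size_t n) = 0;
    // default: resolve the ref now and reuse the existing entry point
    virtual void copy_to_device_ref(RefLike& dst, RefLike& src, size_t n) {
        copy_to_device(dst.get_ptr(), src.get_ptr(), n);
    }
};

struct EagerImpl final : ImplBase {
    void copy_to_device(void* /*dst*/, const void* /*src*/, size_t n) override {
        std::printf("eager copy of %zu bytes\n", n);  // e.g. a device memcpy
    }
    // no copy_to_device_ref override: inherits the eager fallback above
};
```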
diff --git a/src/gopt/test/no_memory_copy.cpp b/src/gopt/test/no_memory_copy.cpp
index 4bd0b53dd0fa1c013c48b77672e4a13fbfc01e0c..09d2469c7961c42e88668362b237c2da6081fb71 100644
--- a/src/gopt/test/no_memory_copy.cpp
+++ b/src/gopt/test/no_memory_copy.cpp
@@ -41,7 +41,8 @@ struct TestGraph {
         f = m_network->add_elemwise(
                 {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
         f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
-        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+        f = m_network->add_pooling(f, {2, 2}, {2, 2});
+        m_out_var = m_network->add_concat(f, -f);
     }
 
     void create_graph_with_subtensor_forward() {
@@ -63,7 +64,8 @@
         f = m_network->add_elemwise(
                 {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
         f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
-        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+        f = m_network->add_pooling(f, {2, 2}, {2, 2});
+        m_out_var = m_network->add_concat(f, -f);
     }
 
     void create_graph_with_subtensor_relayout() {
@@ -86,7 +88,8 @@
         f = m_network->add_elemwise(
                 {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
         f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
-        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+        f = m_network->add_pooling(f, {2, 2}, {2, 2});
+        m_out_var = m_network->add_concat(f, -f);
     }
 
     void create_graph_with_setsubtensor() {
@@ -113,7 +116,8 @@
         f = m_network->add_elemwise(
                 {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
         f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
-        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+        f = m_network->add_pooling(f, {2, 2}, {2, 2});
+        m_out_var = m_network->add_concat(f, -f);
     }
 
     std::unique_ptr<cg::AsyncExecutable> compile_without_copy() {
@@ -173,8 +177,8 @@ TEST(TestNoCopy, IONoCopyPtrEQ) {
     test_graph.create_graph();
     auto func = test_graph.compile_without_copy();
     auto&& outvar = func->get_output_vars()[0];
-    DeviceTensorND dv0(test_graph.m_cn, {1, 8, 7, 7});
-    DeviceTensorND dv1(test_graph.m_cn, {1, 8, 7, 7});
+    DeviceTensorND dv0(test_graph.m_cn, {2, 8, 7, 7});
+    DeviceTensorND dv1(test_graph.m_cn, {2, 8, 7, 7});
     size_t times = 10;
     for (size_t i = 0; i < times; i++) {
         auto input_tensor = test_graph.input_tensor;
@@ -229,7 +233,7 @@
             ptr[d] = i / 5 + 3;
         }
         input_tensor->reset(storage, layout);
-        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        DeviceTensorND dv(test_graph.m_cn, {2, 8, 7, 7});
         outvar->init_mem_plan(&dv);
         outvar->reset_dev_tensor_from_tensor(dv);
 
@@ -258,7 +262,7 @@
     HostTensorND truth;
     auto func = test_graph.compile_without_copy();
     auto&& outvar = func->get_output_vars()[0];
-    DeviceTensorND tmp(test_graph.m_cn, {1, 8, 7, 7});
+    DeviceTensorND tmp(test_graph.m_cn, {2, 8, 7, 7});
     outvar->init_mem_plan(&tmp);
     size_t times = 10;
     for (size_t i = 0; i < times; i++) {
@@ -272,7 +276,7 @@
             ptr[d] = i / 5 + 3;
         }
         input_tensor->only_reset_raw_storage(storage);
-        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        DeviceTensorND dv(test_graph.m_cn, {2, 8, 7, 7});
         dv.raw_ptr();
 
         auto& dev_tensor = outvar->mutable_dev_tensor();
@@ -306,7 +310,7 @@ void test_subtensor_record(int level) {
     HostTensorND truth;
     auto func = test_graph.compile_without_copy();
     auto&& outvar = func->get_output_vars()[0];
-    DeviceTensorND tmp(test_graph.m_cn, {1, 8, 7, 7});
+    DeviceTensorND tmp(test_graph.m_cn, {2, 8, 7, 7});
     outvar->init_mem_plan(&tmp);
     size_t times = 10;
     for (size_t i = 0; i < times; i++) {
@@ -320,7 +324,7 @@
             ptr[d] = i / 5 + 3;
         }
         input_tensor->only_reset_raw_storage(storage);
-        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        DeviceTensorND dv(test_graph.m_cn, {2, 8, 7, 7});
         dv.raw_ptr();
 
         auto& dev_tensor = outvar->mutable_dev_tensor();
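The expected output shapes change from {1, 8, 7, 7} to {2, 8, 7, 7} because every test graph now ends in `add_concat(f, -f)` along axis 0, which sums the two equal sizes on the concat axis. A tiny standalone check of that shape arithmetic; `concat_shape` is a hypothetical helper for illustration, not part of the test framework:

```cpp
// Concat along `axis` adds the sizes on that axis; all other axes must match.
#include <cassert>
#include <cstddef>
#include <vector>

std::vector<size_t> concat_shape(
        const std::vector<size_t>& a, const std::vector<size_t>& b, size_t axis) {
    assert(a.size() == b.size() && axis < a.size());
    std::vector<size_t> out = a;
    for (size_t i = 0; i < a.size(); ++i) {
        if (i == axis) {
            out[i] = a[i] + b[i];  // concat axis sizes add up
        } else {
            assert(a[i] == b[i]);  // non-concat axes must agree
        }
    }
    return out;
}

int main() {
    auto out = concat_shape({1, 8, 7, 7}, {1, 8, 7, 7}, 0);
    assert((out == std::vector<size_t>{2, 8, 7, 7}));
}
```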
diff --git a/src/opr/impl/standalone/nms_opr.cpp b/src/opr/impl/standalone/nms_opr.cpp
index 9d5fa3749b8e3711e7a717c2ac4f510c801d9842..8e212fd255ce9a2209019d36862c466f4f862675 100644
--- a/src/opr/impl/standalone/nms_opr.cpp
+++ b/src/opr/impl/standalone/nms_opr.cpp
@@ -139,11 +139,9 @@ void NMSKeep::CPUKern::exec(
     // See CUDAKern::exec for more explanation on output comp nodes.
     CompNode comp_node = out_idx.comp_node();
 
-    auto inp_ptr = inp.ptr<float>();
-    auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()),
-         out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
     size_t batch = inp.shape(0), nr_boxes = inp.shape(1);
     if (nr_boxes == 0) {
+        auto out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
         for (size_t i = 0; i < batch; ++i) {
             *(out_size_ptr + i) = 0;
         }
@@ -157,6 +155,11 @@
     // be dispatched on a different thread
     auto kern = [=]() {
         for (size_t i = 0; i < batch; ++i) {
+            auto inp_ptr = inp.as_megdnn().ptr<float>();
+            auto out_idx_ptr =
+                    reinterpret_cast<uint32_t*>(out_idx.as_megdnn().ptr<int32_t>());
+            auto out_size_ptr =
+                    reinterpret_cast<uint32_t*>(out_size.as_megdnn().ptr<int32_t>());
             nms::cpu_kern(
                     nr_boxes, param.max_output, param.iou_thresh,
                     inp_ptr + i * nr_boxes * 4, out_idx_ptr + i * param.max_output,
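The nms_opr.cpp hunks follow the same rule as the RefPtr changes: tensor addresses are read inside the dispatched `kern` lambda rather than at dispatch time, so a recorded graph that replays after `only_reset_raw_storage()` sees the tensor's current buffer. A self-contained illustration of that difference, assuming a shared-slot tensor model; `TensorLike` is hypothetical, not MegBrain's `HostTensorND`:

```cpp
// Dispatch-time vs. run-time address resolution: a raw pointer captured at
// dispatch time goes stale after a storage reset, while a lambda that calls
// raw_ptr() at execution time sees the current buffer.
#include <cstdio>
#include <memory>

struct TensorLike {
    std::shared_ptr<float*> slot;  // shared slot updated on storage reset
    float* raw_ptr() const { return *slot; }
    void reset(float* p) { *slot = p; }
};

int main() {
    float buf_a[1] = {1.f}, buf_b[1] = {2.f};
    TensorLike t{std::make_shared<float*>(buf_a)};

    float* frozen = t.raw_ptr();  // resolved at "dispatch" time
    auto task = [t]() { std::printf("lazy: %g\n", t.raw_ptr()[0]); };

    t.reset(buf_b);  // analogous to input_tensor->only_reset_raw_storage()
    task();                                  // prints 2: current storage
    std::printf("frozen: %g\n", frozen[0]);  // prints 1: stale address
}
```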