diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h
index 1bf26b82459f8cb4cba07a2c27c89b8d05ada816..3fa6fbefcb70b229f05a332d9c91861244f4e26d 100644
--- a/lite/include/lite/network.h
+++ b/lite/include/lite/network.h
@@ -93,6 +93,7 @@ struct LITE_API Options {
     bool const_shape = false;
     bool force_dynamic_alloc = false;
     bool force_output_dynamic_alloc = false;
+    bool force_output_use_user_specified_memory = false;
     bool no_profiling_on_shape_change = false;
     uint8_t jit_level = 0;
     uint8_t comp_node_seq_record_level = 0;
diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h
index ff838d0e815e2bfd53182c63fc6b24f08aff9246..df01fdb0c8c882c05d6f5e03438ff2326281be0d 100644
--- a/lite/lite-c/include/lite-c/network_c.h
+++ b/lite/lite-c/include/lite-c/network_c.h
@@ -83,6 +83,7 @@ typedef struct Options {
     int const_shape;
     int force_dynamic_alloc;
     int force_output_dynamic_alloc;
+    int force_output_use_user_specified_memory;
     int no_profiling_on_shape_change;
     int jit_level;
     int comp_node_seq_record_level;
diff --git a/lite/lite-c/src/network.cpp b/lite/lite-c/src/network.cpp
index 51df08cf99c10316d36628fff4013afb1af11542..49fb94248b66de39c74d6108f707b69e0c430583 100644
--- a/lite/lite-c/src/network.cpp
+++ b/lite/lite-c/src/network.cpp
@@ -29,6 +29,7 @@ const LiteOptions default_option = {
        .const_shape = false,
        .force_dynamic_alloc = false,
        .force_output_dynamic_alloc = false,
+       .force_output_use_user_specified_memory = false,
        .no_profiling_on_shape_change = false,
        .jit_level = 0,
        .comp_node_seq_record_level = 0,
@@ -122,7 +123,9 @@ lite::Config convert_to_lite_config(const LiteConfig c_config) {
     lite_config.options.var_sanity_check_first_run =
             c_config.options.var_sanity_check_first_run;
     lite_config.options.const_shape = c_config.options.const_shape;
-    lite_config.options.force_dynamic_alloc = c_config.options.const_shape;
+    lite_config.options.force_dynamic_alloc = c_config.options.force_dynamic_alloc;
+    lite_config.options.force_output_use_user_specified_memory =
+            c_config.options.force_output_use_user_specified_memory;
     lite_config.options.force_output_dynamic_alloc =
             c_config.options.force_output_dynamic_alloc;
     lite_config.options.no_profiling_on_shape_change =
diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py
index 7324f48d142c13b39958e77b677b2c3a3f6a93ea..1106079acd4fa064986eaeaabb8addd749576efc 100644
--- a/lite/pylite/megenginelite/network.py
+++ b/lite/pylite/megenginelite/network.py
@@ -29,6 +29,7 @@ class LiteOptions(Structure):
         ("const_shape", c_int),
         ("force_dynamic_alloc", c_int),
         ("force_output_dynamic_alloc", c_int),
+        ("force_output_use_user_specified_memory", c_int),
         ("no_profiling_on_shape_change", c_int),
         ("jit_level", c_int),
         ("comp_node_seq_record_level", c_int),
@@ -52,6 +53,7 @@ class LiteOptions(Structure):
         self.const_shape = False
         self.force_dynamic_alloc = False
         self.force_output_dynamic_alloc = False
+        self.force_output_use_user_specified_memory = False
         self.no_profiling_on_shape_change = False
         self.jit_level = 0
         self.comp_node_seq_record_level = 0
@@ -67,6 +69,7 @@ class LiteOptions(Structure):
             "const_shape": bool(self.const_shape),
             "force_dynamic_alloc": bool(self.force_dynamic_alloc),
             "force_output_dynamic_alloc": bool(self.force_output_dynamic_alloc),
+            "force_output_use_user_specified_memory": bool(self.force_output_use_user_specified_memory),
             "no_profiling_on_shape_change": bool(self.no_profiling_on_shape_change),
             "jit_level": self.jit_level,
             "comp_node_seq_record_level": self.comp_node_seq_record_level,
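For context, this is roughly how the new option is meant to be used from the Lite C++ API: enable it in the `Config`, then point the output tensor at caller-owned memory before each `forward()`. A minimal sketch only; the model path, input name, and output layout are placeholder values borrowed from the tests added later in this patch, not part of the API.

```cpp
#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

using namespace lite;

void run_into_user_buffer() {
    Config config;
    // ask the network to write outputs directly into user-specified memory
    config.options.force_output_use_user_specified_memory = true;

    auto network = std::make_shared<Network>(config);
    network->load_model("./shufflenet.mge");  // placeholder model path

    // ... fill the input obtained via network->get_io_tensor("data") as usual ...

    // caller-owned buffer whose layout matches the statically inferred output
    Tensor user_buffer(
            LiteDeviceType::LITE_CPU,
            Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});

    std::shared_ptr<Tensor> output = network->get_output_tensor(0);
    // must be done before every forward(): redirect the output to user memory
    output->reset(user_buffer.get_memory_ptr(), user_buffer.get_layout());

    network->forward();
    network->wait();
    // the result now lives in user_buffer, with no extra output copy
}
```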
diff --git a/lite/src/mge/network_impl.cpp b/lite/src/mge/network_impl.cpp
index 247c486afb132ab492fca1a67ea94ef38ad2e1fa..1cfde0fa2ad63ff69d3a63f16f67bf036599f3da 100644
--- a/lite/src/mge/network_impl.cpp
+++ b/lite/src/mge/network_impl.cpp
@@ -84,6 +84,9 @@ void NetworkImplDft::application_config() {
     m_load_config.const_var_shape = m_user_config->options.const_shape;
     ConfigOption(force_dynamic_alloc, force_dynamic_alloc);
     ConfigOption(force_output_dynamic_alloc, force_output_dynamic_alloc);
+    ConfigOption(
+            force_output_use_user_specified_memory,
+            force_output_use_user_specified_memory);
     ConfigOption(no_profiling_on_shape_change, no_profiling_on_shape_change);
     LITE_ASSERT(
             m_user_config->options.jit_level == 0 ||
@@ -250,7 +253,13 @@ void NetworkImplDft::make_output_spec() {
                 }
             }
         };
-        m_output_spec.emplace_back(load_out, std::move(cb));
+        //! if writing to user-specified memory, the CallbackCaller must be nullptr.
+        if (m_user_config->options.force_output_use_user_specified_memory ||
+            m_user_config->options.force_output_dynamic_alloc) {
+            m_output_spec.emplace_back(load_out, nullptr);
+        } else {
+            m_output_spec.emplace_back(load_out, std::move(cb));
+        }
     } else {
         LITE_THROW(ssprintf("no output named : %s in the mode", out.name.c_str()));
     }
@@ -444,8 +453,7 @@ void NetworkImplDft::set_io(const NetworkIO& network_io) {
     }
 }
 
-void NetworkImplDft::try_infer_tensor_layout(
-        std::shared_ptr<Tensor> tensor, mgb::cg::SymbolVar var) {
+void NetworkImplDft::try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var) {
     auto&& static_infer_mgr = m_load_config.comp_graph->static_infer_manager();
     auto infer_trait = var.node()->get_static_infer_trait();
     if (std::get<0>(infer_trait)) {
@@ -455,9 +463,13 @@ void NetworkImplDft::try_infer_tensor_layout(
                     "Lite infer output shape failed, maybe the model is "
                     "dynamic "
                     "shape.\n");
+            LITE_ASSERT(
+                    !m_user_config->options.force_output_use_user_specified_memory,
+                    "force_output_use_user_specified_memory can't be used when output "
+                    "shape can't be derived.");
             return;
         }
-        Layout layout = to_lite_layout(mgb::TensorLayout{*shape, var.dtype()});
+        Layout layout = to_lite_layout(TensorLayout{*shape, var.dtype()});
         tensor->set_layout(layout);
     }
 }
@@ -559,8 +571,7 @@ void NetworkImplDft::update_output() {
              out_it != m_network_io->outputs.end();) {
            if (std::find_if(
                        m_load_result.output_var_list.begin(),
-                       m_load_result.output_var_list.end(),
-                       [out_it](const mgb::SymbolVar var) {
+                       m_load_result.output_var_list.end(), [out_it](const SymbolVar var) {
                            return var.node()->name() == out_it->name;
                        }) == m_load_result.output_var_list.end()) {
                LITE_LOG("%s is not the network output, ignore it.", out_it->name.c_str());
@@ -584,7 +595,7 @@ void NetworkImplDft::update_output() {
            out_it->lite_tensor =
                    std::make_shared<Tensor>(device_id, stream_id, device_type);
        }
-       mgb::SymbolVar var;
+       SymbolVar var;
        for (auto&& out_var : m_load_result.output_var_list) {
            if (out_var.node()->name() == out_it->name) {
                var = out_var;
@@ -592,10 +603,12 @@ void NetworkImplDft::update_output() {
            }
        }
        try_infer_tensor_layout(out_it->lite_tensor, var);
+       output_tensor_copy_optimize(var, out_it->lite_tensor);
    }
    //! user not set, use default output
    } else {
        for (auto&& out : m_load_result.output_var_list) {
+           std::shared_ptr<Tensor> lite_tensor = nullptr;
            auto it = std::find_if(
                    m_network_io->outputs.begin(), m_network_io->outputs.end(),
                    [&out](const IOInner io) { return io.name == out.node()->name(); });
@@ -608,6 +621,7 @@ void NetworkImplDft::update_output() {
                        std::make_shared<Tensor>(device_id, stream_id, device_type);
            }
            try_infer_tensor_layout(it->lite_tensor, out);
+           lite_tensor = it->lite_tensor;
        } else {
            IOInner output;
            output.name = out.node()->name();
@@ -615,11 +629,47 @@ void NetworkImplDft::update_output() {
                    device_id, stream_id, device_type, true);
            m_network_io->outputs.push_back({output});
            try_infer_tensor_layout(output.lite_tensor, out);
+           lite_tensor = output.lite_tensor;
        }
+       output_tensor_copy_optimize(out, lite_tensor);
    }
 }
}
 
+void NetworkImplDft::output_tensor_copy_optimize(
+        Var var, std::shared_ptr<Tensor> tensor) {
+    LITE_ASSERT(
+            !(m_user_config->options.force_output_use_user_specified_memory &&
+              m_user_config->options.force_output_dynamic_alloc),
+            "Can't set force_output_use_user_specified_memory and "
+            "force_output_dynamic_alloc at the same time.");
+    if (m_user_config->options.force_output_use_user_specified_memory) {
+        TensorHelper::implement(tensor)
+                ->cast_final_safe<TensorImplDft>()
+                .set_reset_callback([var](TensorImplDft* dft_tensor) {
+                    dft_tensor->device_share_host_memory();
+                    auto dv = dft_tensor->dev_tensor().get();
+                    dv->comp_node(var.node()->comp_node(), true);
+                    var.node()->init_mem_plan(dv);
+                    var.node()->reset_dev_tensor_from_tensor(*dv);
+                });
+    }
+    if (m_user_config->options.force_output_dynamic_alloc) {
+        TensorHelper::implement(tensor)
+                ->cast_final_safe<TensorImplDft>()
+                .set_get_memory_callback([var](TensorImplDft* dft_tensor) {
+                    if (dft_tensor->is_host()) {
+                        auto host_tensor = dft_tensor->m_host_tensor;
+                        *host_tensor =
+                                HostTensorND::make_proxy(var.node()->dev_tensor());
+                    } else {
+                        auto dev_tensor = dft_tensor->m_dev_tensor;
+                        *dev_tensor = var.node()->dev_tensor();
+                    }
+                });
+    }
+}
+
 std::shared_ptr<Tensor> NetworkImplDft::get_io_tensor(
         std::string io_name, LiteTensorPhase phase) {
     if (phase == LiteTensorPhase::LITE_INPUT || phase == LiteTensorPhase::LITE_IO) {
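The get-memory callback gives `force_output_dynamic_alloc` a no-copy path as well: the Lite output tensor only becomes a proxy of the graph var's storage when its memory is first queried after `forward()`. A minimal sketch of that usage, assuming the same placeholder model as in the earlier example:

```cpp
#include <memory>

#include "lite/network.h"
#include "lite/tensor.h"

using namespace lite;

void run_with_dynamic_output() {
    Config config;
    // outputs stay dynamically allocated inside the graph; no output copy is made
    config.options.force_output_dynamic_alloc = true;

    auto network = std::make_shared<Network>(config);
    network->load_model("./shufflenet.mge");  // placeholder model path

    // ... fill the input obtained via network->get_io_tensor("data") as usual ...

    std::shared_ptr<Tensor> output = network->get_output_tensor(0);
    network->forward();
    network->wait();

    // this call fires the get-memory callback, which points the Lite tensor
    // at the var's dev tensor instead of copying it
    void* result = output->get_memory_ptr();
    (void)result;
}
```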
diff --git a/lite/src/mge/network_impl.h b/lite/src/mge/network_impl.h
index 098bb7fb170ed7b213ede00e2cc73c4ec9307221..903f92f02141921403a8677493e6da347cfde52d 100644
--- a/lite/src/mge/network_impl.h
+++ b/lite/src/mge/network_impl.h
@@ -12,6 +12,7 @@
 #pragma once
 
 #include "lite_build_config.h"
+#include "megbrain/graph.h"
 
 #if LITE_BUILD_WITH_MGE
 #include "lite/network.h"
@@ -41,6 +42,7 @@ class NetworkImplDft final : public Network::NetworkImplBase {
 public:
     NetworkImplDft() { m_load_config.comp_graph = mgb::ComputingGraph::make(); }
     using S = megdnn::param::ExecutionPolicy::Strategy;
+    using Var = mgb::cg::SymbolVar;
     //! set the config of the network, include:
     //! the inference device
     //! the other inference options, such as record_level, weight_preprocess...
@@ -207,8 +209,10 @@ private:
     void compile_graph();
 
     //! try to infer output tensor layout
-    void try_infer_tensor_layout(
-            std::shared_ptr<Tensor> tensor, mgb::cg::SymbolVar var);
+    void try_infer_tensor_layout(std::shared_ptr<Tensor> tensor, Var var);
+
+    //! optimized output tensor copy
+    void output_tensor_copy_optimize(Var var, std::shared_ptr<Tensor> tensor);
 
 private:
     bool m_async = false;
diff --git a/lite/src/mge/tensor_impl.cpp b/lite/src/mge/tensor_impl.cpp
index d9fc7ce8f9c6cc28ebf0611865b4491f90c97a61..63d38ec70f0246a55b8fb257b64f58e7eadcc543 100644
--- a/lite/src/mge/tensor_impl.cpp
+++ b/lite/src/mge/tensor_impl.cpp
@@ -149,6 +149,9 @@ Layout TensorImplDft::get_layout() const {
 }
 
 void* TensorImplDft::get_memory_ptr() const {
+    if (m_get_memory_callback) {
+        m_get_memory_callback(const_cast<TensorImplDft*>(this));
+    }
     if (is_host()) {
         return static_cast<void*>(m_host_tensor->raw_ptr());
     } else {
@@ -157,6 +160,9 @@ void* TensorImplDft::get_memory_ptr() const {
 }
 
 void* TensorImplDft::get_memory_ptr(const std::vector<size_t>& idx) const {
+    if (m_get_memory_callback) {
+        m_get_memory_callback(const_cast<TensorImplDft*>(this));
+    }
     if (is_host()) {
         auto elemsize_log = m_host_tensor->layout().dtype.size_log();
         switch (elemsize_log) {
@@ -317,6 +323,9 @@ void TensorImplDft::reset(void* prepared_data) {
         storage.reset(cn, size, raw_storage);
         m_dev_tensor->reset(storage, mge_layout);
     }
+    if (m_reset_callback) {
+        m_reset_callback(this);
+    }
 }
 
 void TensorImplDft::reset(void* prepared_data, const Layout& layout) {
@@ -430,6 +439,34 @@ void TensorImplDft::copy_from_mge_tensor(const mgb::DeviceTensorND& dv) {
     }
 }
 
+void TensorImplDft::set_reset_callback(const std::function<void(TensorImplDft*)>& cb) {
+    m_reset_callback = cb;
+}
+
+void TensorImplDft::set_get_memory_callback(
+        const std::function<void(TensorImplDft*)>& cb) {
+    m_get_memory_callback = cb;
+}
+
+void TensorImplDft::device_share_host_memory() {
+    if (is_host()) {
+        if (!m_dev_tensor) {
+            m_dev_tensor = std::make_shared<mgb::DeviceTensorND>(
+                    m_host_tensor->comp_node(), m_host_tensor->layout());
+        }
+        if (m_host_tensor->raw_ptr() != m_dev_tensor->raw_ptr()) {
+            auto raw_storage = std::shared_ptr<mgb::dt_byte>(
+                    m_host_tensor->raw_ptr(), [](void*) {});
+            auto cn = m_host_tensor->comp_node();
+            auto mge_layout = m_host_tensor->layout();
+            size_t size = mge_layout.span().dist_byte();
+            mgb::DeviceTensorStorage storage;
+            storage.reset(cn, size, raw_storage);
+            m_dev_tensor->reset(storage, mge_layout);
+        }
+    }
+}
+
 #endif
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/lite/src/mge/tensor_impl.h b/lite/src/mge/tensor_impl.h
index 17f125323efa6209c04ceed1b9ecbbe418ca3679..a33e2bf77fb79acafb255d8623d044a00e6349ba 100644
--- a/lite/src/mge/tensor_impl.h
+++ b/lite/src/mge/tensor_impl.h
@@ -97,11 +97,22 @@ public:
     //! get host tensor
     std::shared_ptr<mgb::HostTensorND> host_tensor() const { return m_host_tensor; }
+
     //! get device tensor
     std::shared_ptr<mgb::DeviceTensorND> dev_tensor() const { return m_dev_tensor; }
+
     //! copy from mgb tensor
     void copy_from_mge_tensor(const mgb::DeviceTensorND& dv);
 
+    //! set tensor reset callback
+    void set_reset_callback(const std::function<void(TensorImplDft*)>& cb);
+
+    //! set tensor get memory callback
+    void set_get_memory_callback(const std::function<void(TensorImplDft*)>& cb);
+
+    //! share the same memory between the host and device tensor
+    void device_share_host_memory();
+
 public:
     friend class NetworkImplDft;
 
@@ -115,6 +126,8 @@ private:
     void set_mge_tensor_compnode(const mgb::CompNode& comp_node);
 
 private:
+    std::function<void(TensorImplDft*)> m_get_memory_callback;
+    std::function<void(TensorImplDft*)> m_reset_callback;
     std::shared_ptr<mgb::HostTensorND> m_host_tensor;
     std::shared_ptr<mgb::DeviceTensorND> m_dev_tensor;
 };
diff --git a/lite/src/network.cpp b/lite/src/network.cpp
index 745dbe5d23e0aa7b49653cbabec79b6b1bc27615..abe1db3037022c845344fae254879de550770395 100644
--- a/lite/src/network.cpp
+++ b/lite/src/network.cpp
@@ -153,6 +153,10 @@ std::shared_ptr<Tensor> Network::get_output_tensor(size_t index) {
 
 Network& Network::set_async_callback(const AsyncCallback& callback) {
     LITE_ERROR_HANDLER_BEGIN
+    LITE_ASSERT(
+            !m_config.options.force_output_use_user_specified_memory,
+            "Async mode can't be used with force_output_use_user_specified_memory, "
+            "which writes output data directly to user-specified memory.");
     LITE_CHECK_NON_NULL_POINTER(m_impl);
     m_impl->set_async_callback(std::move(callback));
     return *this;
diff --git a/lite/test/test_network.cpp b/lite/test/test_network.cpp
index 617f8055e4d515ecc72c7590445dd3a80fcf1ee7..f786b92c1e2b05df84b7121417b1a09395c1c148 100644
--- a/lite/test/test_network.cpp
+++ b/lite/test/test_network.cpp
@@ -397,6 +397,73 @@ TEST(TestNetWork, ResetOutput) {
     compare_lite_tensor<float>(output_tensor, result_mgb);
 }
 
+TEST(TestNetWork, OutputNoCopy) {
+    Config config;
+    config.options.force_output_use_user_specified_memory = true;
+    auto tensor = get_input_data("./input_data.npy");
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    auto result_mgb = mgb_lar(model_path, config, input_name, tensor);
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(model_path);
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+
+    auto src_ptr = tensor->get_memory_ptr();
+    auto src_layout = tensor->get_layout();
+    input_tensor->reset(src_ptr, src_layout);
+
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+    size_t times = 5;
+    std::vector<std::shared_ptr<Tensor>> result_tensors;
+    for (size_t i = 0; i < times; i++) {
+        auto tmp = std::make_shared<Tensor>(
+                LiteDeviceType::LITE_CPU,
+                Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
+        result_tensors.push_back(tmp);
+    }
+
+    for (size_t i = 0; i < times; i++) {
+        void* out_data = result_tensors[i]->get_memory_ptr();
+        output_tensor->reset(out_data, result_tensors[i]->get_layout());
+
+        network->forward();
+        network->wait();
+        ASSERT_EQ(output_tensor->get_memory_ptr(), out_data);
+        compare_lite_tensor<float>(output_tensor, result_mgb);
+    }
+    for (size_t i = 0; i < times; i++) {
+        compare_lite_tensor<float>(result_tensors[i], result_mgb);
+    }
+}
+
+TEST(TestNetWork, OutputDynamicAlloc) {
+    Config config;
+    config.options.force_output_dynamic_alloc = true;
+    auto tensor = get_input_data("./input_data.npy");
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+    auto result_mgb = mgb_lar(model_path, config, input_name, tensor);
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(model_path);
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+
+    auto src_ptr = tensor->get_memory_ptr();
+    auto src_layout = tensor->get_layout();
+    input_tensor->reset(src_ptr, src_layout);
+
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+    size_t times = 5;
+    for (size_t i = 0; i < times; i++) {
+        network->forward();
+        network->wait();
+        compare_lite_tensor<float>(output_tensor, result_mgb);
+    }
+}
+
 TEST(TestNetWork, AsyncExec) {
     Config config;
     config.options.var_sanity_check_first_run = false;
diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index b175489dce2426ae8c3816ad0c6efedd436e3109..add192779c6609bde3be6d0531791f48cba79609 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -507,13 +507,12 @@ void ComputingGraphImpl::dest_var_optimize(VarNodeArray& dest_vars) {
             i->add_flag(F::NO_MEM_RECLAIM);
         }
     }
-    if (dest_vars[0]->owner_graph()->options().force_output_write_to_user_memory) {
+    if (dest_vars[0]->owner_graph()->options().force_output_use_user_specified_memory) {
         for (auto&& i : dest_vars) {
             mgb_assert(
                     !i->contain_flag(F::RT_FORCE_DYNAMIC_MEM_ALLOC),
-                    "var %s with force dynamic allocate should be set to write output "
-                    "to "
-                    "user memory",
+                    "var %s with RT_FORCE_DYNAMIC_MEM_ALLOC flag should not be "
+                    "forced to write output to user-specified memory",
                     i->cname());
             i->add_flag(
                     F::NO_SYS_MEM_ALLOC | F::NO_SYS_STATIC_MEM_ALLOC |
diff --git a/src/core/impl/graph/var_node.cpp b/src/core/impl/graph/var_node.cpp
index 5c8909b51fa77693390621ebbe0359f25c541c74..833da401d6f7c8faedd80698d8a8dd883a5bdbd9 100644
--- a/src/core/impl/graph/var_node.cpp
+++ b/src/core/impl/graph/var_node.cpp
@@ -574,6 +574,10 @@ MemAllocPlan& VarNode::init_mem_plan(const DeviceTensorND* fixed_alloc) {
     return m_mem_plan;
 }
 
+bool VarNode::is_graph_dest_varnode() {
+    return ComputingGraphImpl::downcast(owner_graph())->var_receiver(this).size() == 0;
+}
+
 VarNode& VarNode::add_flag(Flag flag) {
     modify_flag(flag, m_flag | flag);
     return *this;
@@ -582,10 +586,13 @@ VarNode& VarNode::add_flag(Flag flag) {
 void VarNode::modify_flag(Flag delta, Flag new_flag) {
     if (contain_flag(Flag::FLAG_FREEZED)) {
         mgb_assert(
-                (delta & (Flag::NO_SYS_MEM_ALLOC | Flag::NO_MEM_RECLAIM |
-                          Flag::NO_SYS_STATIC_MEM_ALLOC |
-                          Flag::RT_FORCE_DYNAMIC_MEM_ALLOC)) == delta ||
-                (new_flag & Flag::MEMORY_NO_NEED));
+                (delta & (Flag::NO_MEM_RECLAIM | Flag::NO_SYS_STATIC_MEM_ALLOC |
+                          Flag::RT_FORCE_DYNAMIC_MEM_ALLOC | Flag::MEMORY_NO_NEED)) ==
+                                delta ||
+                        is_graph_dest_varnode(),
+                "after the FLAG_FREEZED flag is set, a var can only modify the "
+                "NO_MEM_RECLAIM, NO_SYS_STATIC_MEM_ALLOC, RT_FORCE_DYNAMIC_MEM_ALLOC "
+                "and MEMORY_NO_NEED flags, unless it is a graph dest var.");
 
         mgb_assert(
                 !ComputingGraphImpl::downcast(owner_graph())
diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h
index 9d6c846fd7cdecaab72d15b9bec2e6eb270b59c5..ae1d4a16554eb8e598b682394781da95cf1c16c6 100644
--- a/src/core/include/megbrain/graph/cg.h
+++ b/src/core/include/megbrain/graph/cg.h
@@ -421,7 +421,7 @@ public:
          * Force the output to be written to the user specified memory, which
          * can optimize the copy of output data at one time
          */
-        bool force_output_write_to_user_memory = false;
+        bool force_output_use_user_specified_memory = false;
 
         //! whether to perform var sanity check on first run
         bool var_sanity_check_first_run = true;
diff --git a/src/core/include/megbrain/graph/var_node.h b/src/core/include/megbrain/graph/var_node.h
index 611f482a1a94af430af54d243b138379ab21555a..74db600a267aa34eb72968223c6b91265c716c3f 100644
--- a/src/core/include/megbrain/graph/var_node.h
+++ b/src/core/include/megbrain/graph/var_node.h
@@ -549,6 +549,10 @@ private:
     MGE_WIN_DECLSPEC_FUC void modify_flag(Flag delta, Flag new_flag);
 
+    //! whether this var is a graph dest (output) var; if so, the
+    //! NO_SYS_MEM_ALLOC flag can still be modified after FLAG_FREEZED
+    bool is_graph_dest_varnode();
+
     MGE_WIN_DECLSPEC_FUC void assign_dev_tensor_from_tensor(
             const DeviceTensorND& value);
 
diff --git a/src/gopt/test/no_memory_copy.cpp b/src/gopt/test/no_memory_copy.cpp
index f1c5f00678c3d74a7a17246fcc6df26085630c79..5e19f76467eea022695d67c47ace357a038797c5 100644
--- a/src/gopt/test/no_memory_copy.cpp
+++ b/src/gopt/test/no_memory_copy.cpp
@@ -82,7 +82,7 @@ TEST(TestNoCopy, BasicInputNoCopy) {
 TEST(TestNoCopy, IONoCopyPtrEQ) {
     auto test_graph = TestGraph();
     auto compute_graph = test_graph.m_network->graph;
-    compute_graph->options().force_output_write_to_user_memory = true;
+    compute_graph->options().force_output_use_user_specified_memory = true;
     test_graph.create_graph();
     auto func = test_graph.compile_without_copy();
     auto&& outvar = func->get_output_vars()[0];
@@ -123,7 +123,7 @@ TEST(TestNoCopy, IONoCopyPtrEQ) {
 TEST(TestNoCopy, IONoCopyCorrect) {
     auto test_graph = TestGraph();
     auto compute_graph = test_graph.m_network->graph;
-    compute_graph->options().force_output_write_to_user_memory = true;
+    compute_graph->options().force_output_use_user_specified_memory = true;
     test_graph.create_graph();
     HostTensorND truth;
     auto func = test_graph.compile_without_copy();
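For completeness, a rough sketch of the graph-level contract behind the renamed option, mirroring what the Lite reset callback above does: once `force_output_use_user_specified_memory` is set on `ComputingGraph::options()` and the outputs are compiled with a nullptr callback, the caller has to hand every dest var a `DeviceTensorND` over its own memory before each execution. The function below is illustrative only; `func`, `out_var`, and `user_dv` are assumed to come from the surrounding application, as in the TestGraph helper used by the gopt tests.

```cpp
#include "megbrain/graph.h"

// Sketch under the assumptions stated above: redirect one graph output var to
// caller-owned memory, then run the compiled function once.
void run_step_into_user_memory(
        mgb::cg::AsyncExecutable* func, mgb::cg::VarNode* out_var,
        mgb::DeviceTensorND& user_dv) {
    // must be repeated before every execution, just like the Lite reset callback
    user_dv.comp_node(out_var->comp_node(), true);
    out_var->init_mem_plan(&user_dv);
    out_var->reset_dev_tensor_from_tensor(user_dv);

    func->execute();
    func->wait();
}
```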