diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index f897c7552030188041adec37eaf5c059991ee73d..b175489dce2426ae8c3816ad0c6efedd436e3109 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -492,6 +492,36 @@ SmallVector<std::unique_ptr<AsyncExecutable>> ComputingGraphImpl::compile_multi_
 #endif
 }
 
+void ComputingGraphImpl::dest_var_optimize(VarNodeArray& dest_vars) {
+    using F = VarNode::Flag;
+    if (dest_vars[0]->owner_graph()->options().force_output_dynamic_alloc) {
+        for (auto&& i : dest_vars) {
+            if (!i->contain_flag(F::NO_SYS_MEM_ALLOC | F::NO_SYS_STATIC_MEM_ALLOC)) {
+                mgb_assert(
+                        !i->contain_flag(F::DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC),
+                        "Can not force graph output dynamic alloc with "
+                        "DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC flag, var: %s",
+                        i->cname());
+                i->add_flag(F::NO_SYS_STATIC_MEM_ALLOC);
+            }
+            i->add_flag(F::NO_MEM_RECLAIM);
+        }
+    }
+    if (dest_vars[0]->owner_graph()->options().force_output_write_to_user_memory) {
+        for (auto&& i : dest_vars) {
+            mgb_assert(
+                    !i->contain_flag(F::RT_FORCE_DYNAMIC_MEM_ALLOC),
+                    "var %s has flag RT_FORCE_DYNAMIC_MEM_ALLOC, "
+                    "so its output can not be written to "
+                    "user-specified memory",
+                    i->cname());
+            i->add_flag(
+                    F::NO_SYS_MEM_ALLOC | F::NO_SYS_STATIC_MEM_ALLOC |
+                    F::NO_MEM_RECLAIM);
+        }
+    }
+}
+
 ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
         const OutputSpec& out_spec) {
     auto&& cmpnt = components();
@@ -620,21 +650,7 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
     std::unordered_map<
             CallbackCallerKey, CallbackCallerVal, CallbackCallerKey::Hash>
             opr2vars;
-    using F = VarNode::Flag;
-    if (dest_vars[0]->owner_graph()->options().force_output_dynamic_alloc) {
-        for (auto&& i : dest_vars) {
-            if (!i->contain_flag(
-                        F::NO_SYS_MEM_ALLOC | F::NO_SYS_STATIC_MEM_ALLOC)) {
-                mgb_assert(
-                        !i->contain_flag(F::DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC),
-                        "Can not force graph output dynamic alloc with "
-                        "DISALLOW_RT_FORCE_DYNAMIC_MEM_ALLOC flag, var: %s",
-                        i->cname());
-                i->add_flag(F::NO_SYS_STATIC_MEM_ALLOC);
-            }
-            i->add_flag(F::NO_MEM_RECLAIM);
-        }
-    }
+    dest_var_optimize(dest_vars);
     for (size_t i = 0; i < out_spec.size(); ++i) {
         auto&& cb = out_spec[i].second;
         if (cb) {
diff --git a/src/core/impl/graph/cg_impl.h b/src/core/impl/graph/cg_impl.h
index 91fda0b222f8744204578cba7b6e7085a4221bf5..07b8b0bd133e62ee718609ca3011a1b5ba05ee4d 100644
--- a/src/core/impl/graph/cg_impl.h
+++ b/src/core/impl/graph/cg_impl.h
@@ -142,6 +142,9 @@ class ComputingGraphImpl final : public ComputingGraph {
     //! finalize the computing sequence for compiling
     std::unique_ptr<AsyncExecutable> compile_commit(CompileState state);
 
+    //! apply memory-allocation optimizations to the dest (output) vars
+    void dest_var_optimize(VarNodeArray& dest_vars);
+
 public:
     class ComputingSequence;
 
diff --git a/src/core/impl/graph/var_node.cpp b/src/core/impl/graph/var_node.cpp
index c8900a74fa8a6ff2e811214f0c21b3969cd5752c..5c8909b51fa77693390621ebbe0359f25c541c74 100644
--- a/src/core/impl/graph/var_node.cpp
+++ b/src/core/impl/graph/var_node.cpp
@@ -582,7 +582,8 @@ VarNode& VarNode::add_flag(Flag flag) {
 void VarNode::modify_flag(Flag delta, Flag new_flag) {
     if (contain_flag(Flag::FLAG_FREEZED)) {
         mgb_assert(
-                (delta & (Flag::NO_MEM_RECLAIM | Flag::NO_SYS_STATIC_MEM_ALLOC |
+                (delta & (Flag::NO_SYS_MEM_ALLOC | Flag::NO_MEM_RECLAIM |
+                          Flag::NO_SYS_STATIC_MEM_ALLOC |
                           Flag::RT_FORCE_DYNAMIC_MEM_ALLOC)) == delta ||
                         (new_flag & Flag::MEMORY_NO_NEED));
 
diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h
index 3dad064817189a87783bb6790ed45e2ef6bb3b89..9d6c846fd7cdecaab72d15b9bec2e6eb270b59c5 100644
--- a/src/core/include/megbrain/graph/cg.h
+++ b/src/core/include/megbrain/graph/cg.h
@@ -417,6 +417,12 @@ public:
          */
         bool force_output_dynamic_alloc = false;
 
+        /*!
+         * Force graph outputs to be written directly to memory specified
+         * by the user, which saves an extra copy of the output data
+         */
+        bool force_output_write_to_user_memory = false;
+
         //! whether to perform var sanity check on first run
         bool var_sanity_check_first_run = true;
 
diff --git a/src/gopt/test/no_memory_copy.cpp b/src/gopt/test/no_memory_copy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1c5f00678c3d74a7a17246fcc6df26085630c79
--- /dev/null
+++ b/src/gopt/test/no_memory_copy.cpp
@@ -0,0 +1,165 @@
+/**
+ * \file src/gopt/test/no_memory_copy.cpp
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */
+
+#include <memory>
+#include "./network.h"
+#include "megbrain/comp_node_env.h"
+#include "megbrain/opr/basic_arith.h"
+#include "megbrain/test/helper.h"
+
+using namespace mgb;
+
+struct TestGraph {
+    CompNode m_cn;
+    HostTensorGenerator<> m_gen;
+    std::unique_ptr<Network> m_network;
+    SymbolVar m_out_var;
+    std::shared_ptr<HostTensorND> input_tensor;
+
+    TestGraph() {
+        m_cn = CompNode::load("cpu0");
+        m_network = std::make_unique<Network>(m_cn);
+    }
+
+    void create_graph() {
+        input_tensor = m_gen({1, 3, 32, 32}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+        auto f = m_network->add_conv(
+                input, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
+    std::unique_ptr<cg::AsyncExecutable> compile_without_copy() {
+        return m_network->graph->compile({{m_out_var, nullptr}});
+    }
+
+    std::unique_ptr<cg::AsyncExecutable> compile_with_copy(HostTensorND& host) {
+        auto cb = [&host](const DeviceTensorND& dv) mutable { host.copy_from(dv); };
+        return m_network->graph->compile({{m_out_var, std::move(cb)}});
+    }
+};
+
+TEST(TestNoCopy, BasicInputNoCopy) {
+    auto test_graph = TestGraph();
+    test_graph.create_graph();
+    HostTensorND out, out_pre;
+    auto func = test_graph.compile_with_copy(out);
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        if (i % 2 == 0) {
+            auto input_tensor = test_graph.input_tensor;
+            auto layout = input_tensor->layout();
+            size_t length = layout.total_nr_elems();
+            auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+            storage.ensure_size(length * sizeof(float));
+            float* ptr = storage.ptr()->as<float>();
+            for (size_t d = 0; d < length; d++) {
+                ptr[d] = i;
+            }
+            input_tensor->reset(storage, layout);
+        }
+        func->execute();
+        func->wait();
+        if (i % 2 != 0) {
+            MGB_ASSERT_TENSOR_EQ(out, out_pre);
+        }
+        out_pre.copy_from(out).sync();
+    }
+}
+
+TEST(TestNoCopy, IONoCopyPtrEQ) {
+    auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().force_output_write_to_user_memory = true;
+    test_graph.create_graph();
+    auto func = test_graph.compile_without_copy();
+    auto&& outvar = func->get_output_vars()[0];
+    DeviceTensorND dv0(test_graph.m_cn, {1, 8, 7, 7});
+    DeviceTensorND dv1(test_graph.m_cn, {1, 8, 7, 7});
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        auto input_tensor = test_graph.input_tensor;
+        auto layout = input_tensor->layout();
+        size_t length = layout.total_nr_elems();
+        auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+        storage.ensure_size(length * sizeof(float));
+        float* ptr = storage.ptr()->as<float>();
+        for (size_t d = 0; d < length; d++) {
+            ptr[d] = i;
+        }
+        input_tensor->reset(storage, layout);
+        if (i % 2 == 0) {
+            outvar->init_mem_plan(&dv0);
+            outvar->reset_dev_tensor_from_tensor(dv0);
+        } else {
+            outvar->init_mem_plan(&dv1);
+            outvar->reset_dev_tensor_from_tensor(dv1);
+        }
+
+        func->execute();
+        func->wait();
+        auto out = func->get_output_vars()[0]->dev_tensor().ptr<float>();
+
+        if (i % 2 == 0) {
+            ASSERT_EQ(dv0.ptr<float>(), out);
+        } else {
+            ASSERT_EQ(dv1.ptr<float>(), out);
+        }
+    }
+}
+
+TEST(TestNoCopy, IONoCopyCorrect) {
+    auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().force_output_write_to_user_memory = true;
+    test_graph.create_graph();
+    HostTensorND truth;
+    auto func = test_graph.compile_without_copy();
+    //! no user memory has been assigned to the output var yet, so
+    //! executing the graph must fail
+    ASSERT_THROW(func->execute(), MegBrainError);
+    auto&& outvar = func->get_output_vars()[0];
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        auto input_tensor = test_graph.input_tensor;
+        auto layout = input_tensor->layout();
+        size_t length = layout.total_nr_elems();
+        auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+        storage.ensure_size(length * sizeof(float));
+        float* ptr = storage.ptr()->as<float>();
+        for (size_t d = 0; d < length; d++) {
+            ptr[d] = i / 5 + 3;
+        }
+        input_tensor->reset(storage, layout);
+        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        outvar->init_mem_plan(&dv);
+        outvar->reset_dev_tensor_from_tensor(dv);
+
+        func->execute();
+        func->wait();
+        if (i % 5 == 0) {
+            truth.copy_from(func->get_output_vars()[0]->dev_tensor()).sync();
+            continue;
+        }
+        HostTensorND to_check;
+        to_check.copy_from(func->get_output_vars()[0]->dev_tensor()).sync();
+        MGB_ASSERT_TENSOR_EQ(to_check, truth);
+    }
+}
+
+TEST(TestNoCopy, InputNoCopyRecord) {}
+
+TEST(TestNoCopy, OutputNoCopyRecord) {}
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
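
---

Usage note (not part of the patch): the sketch below shows how a caller is expected to drive the new `force_output_write_to_user_memory` option. It is distilled from the `IONoCopyPtrEQ` and `IONoCopyCorrect` tests above and assumes the `TestGraph` fixture from `src/gopt/test/no_memory_copy.cpp` is in scope; the wrapper function name `run_with_user_output_memory` is illustrative only.

```cpp
#include "./network.h"             // Network helper used by the TestGraph fixture
#include "megbrain/test/helper.h"

using namespace mgb;

// Illustrative driver: compile a graph whose output is written directly
// into a caller-provided DeviceTensorND, avoiding the extra output copy.
void run_with_user_output_memory() {
    auto test_graph = TestGraph();
    // Request that outputs go to user-specified memory; dest_var_optimize()
    // then tags the output vars with NO_SYS_MEM_ALLOC,
    // NO_SYS_STATIC_MEM_ALLOC and NO_MEM_RECLAIM.
    test_graph.m_network->graph->options().force_output_write_to_user_memory = true;
    test_graph.create_graph();
    auto func = test_graph.compile_without_copy();

    // Caller-owned output buffer; the shape must match the output var
    // ({1, 8, 7, 7} for the test network above).
    DeviceTensorND user_out(test_graph.m_cn, {1, 8, 7, 7});
    auto&& outvar = func->get_output_vars()[0];

    // Bind the user tensor before executing; without this, execute()
    // throws because the graph no longer allocates output memory itself
    // (this is exactly what the IONoCopyCorrect test asserts).
    outvar->init_mem_plan(&user_out);
    outvar->reset_dev_tensor_from_tensor(user_out);

    func->execute();
    func->wait();
    // The result now lives in user_out; the function's output tensor
    // aliases the same storage, as verified by IONoCopyPtrEQ.
}
```

The binding must be repeated before every `execute()` if the user buffer changes between runs, which is why the tests call `init_mem_plan` / `reset_dev_tensor_from_tensor` inside their loops.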