diff --git a/imperative/python/megengine/jit/__init__.py b/imperative/python/megengine/jit/__init__.py
index 83878983ebbc35e832e04768db9ce5443e9504f5..5f5db441472d3451d6325ed84ae52d574a7a7c26 100644
--- a/imperative/python/megengine/jit/__init__.py
+++ b/imperative/python/megengine/jit/__init__.py
@@ -10,6 +10,7 @@ from ..core._imperative_rt.core2 import (
     set_cpp_apply_const_with_tracing,
     set_cpp_apply_with_tracing,
 )
+from .dtr_config import DTRConfig
 from .sublinear_memory_config import SublinearMemoryConfig
 from .tracing import (
     apply_const_with_tracing,
diff --git a/imperative/python/megengine/jit/dtr_config.py b/imperative/python/megengine/jit/dtr_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..39b363b575c03da4e3820aa862ccce10e2badb3a
--- /dev/null
+++ b/imperative/python/megengine/jit/dtr_config.py
@@ -0,0 +1,30 @@
+# -*- coding: utf-8 -*-
+# MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+#
+# Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+
+
+class DTRConfig:
+    r"""
+    Configuration for DTR memory optimization.
+
+    :param eviction_threshold: eviction threshold in bytes; DTR tries to keep
+        estimated memory usage below this value by evicting and recomputing
+        tensors. Must be greater than zero.
+    :param evictee_minimum_size: minimum size in bytes of tensors that may be
+        evicted; smaller tensors are never eviction candidates. Default: 1 MiB.
+    """
+
+    def __init__(
+        self, eviction_threshold: int = 0, evictee_minimum_size: int = 1 << 20
+    ):
+        assert eviction_threshold > 0, "eviction_threshold must be greater than zero"
+        self.eviction_threshold = eviction_threshold
+        assert (
+            evictee_minimum_size >= 0
+        ), "evictee_minimum_size must be greater than or equal to zero"
+        self.evictee_minimum_size = evictee_minimum_size
diff --git a/imperative/python/megengine/jit/tracing.py b/imperative/python/megengine/jit/tracing.py
index d17d6e711d04a0b553505150dcba46587e47e597..31127281085120c03c804a5a0728f36e9139a6ee 100644
--- a/imperative/python/megengine/jit/tracing.py
+++ b/imperative/python/megengine/jit/tracing.py
@@ -37,6 +37,7 @@ from ..core.ops.special import Const
 from ..core.tensor import megbrain_graph as G
 from ..core.tensor.utils import setscalar
 from ..utils.naming import AutoNaming
+from .dtr_config import DTRConfig
 from .sublinear_memory_config import SublinearMemoryConfig
 
 
@@ -142,6 +143,7 @@ class trace:
         symbolic=False,
         capture_as_const=False,
         sublinear_memory_config: SublinearMemoryConfig = None,
+        dtr_config: DTRConfig = None,
         profiling: bool = False,
         opt_level: int = 2,
         symbolic_shape: bool = True,
@@ -150,6 +152,7 @@ class trace:
         self._symbolic = symbolic
         self._capture_as_const = capture_as_const
         self._sublinear_memory_config = sublinear_memory_config
+        self._dtr_config = dtr_config
         self._profiling = profiling
         self._profiler = None
         self._graph_opt_level = opt_level
@@ -491,6 +494,15 @@ class trace:
         graph.options.no_force_inplace = True
         graph.options.seq_opt.enable_seq_comp_node_opt = False
         graph.options.graph_opt_level = self._graph_opt_level
+        if self._dtr_config is not None:
+            graph.options.enable_dtr_memory_opt = True
+            graph.options.dtr_config.eviction_threshold = (
+                self._dtr_config.eviction_threshold
+            )
+            graph.options.dtr_config.evictee_minimum_size = (
+                self._dtr_config.evictee_minimum_size
+            )
+
         # sublinear
         if self._sublinear_memory_config is not None:
             graph.options.enable_sublinear_memory_opt = True
diff --git a/imperative/python/src/graph_rt.cpp b/imperative/python/src/graph_rt.cpp
index 8bb8f5cabfe2e50219d048f89ec598966a3105d0..60b391b26bc36584a921e87773e51354f403a8d3 100644
--- a/imperative/python/src/graph_rt.cpp
+++ b/imperative/python/src/graph_rt.cpp
@@ -395,6 +395,7 @@ void init_graph_rt(py::module m) {
             DEF_READWRITE(allocate_static_mem_after_graph_compile)
             DEF_READWRITE(fake_next_exec)
             DEF_READWRITE(enable_sublinear_memory_opt)
+            DEF_READWRITE(enable_dtr_memory_opt)
             DEF_READWRITE(no_profiling_on_shape_change)
             DEF_READWRITE(enable_var_mem_defragment)
             DEF_READWRITE(enable_grad_var_static_reshape)
@@ -402,6 +403,7 @@ void init_graph_rt(py::module m) {
             DEF_READWRITE(comp_node_seq_record_level)
             DEF_READWRITE(no_force_inplace)
             DEF_READWRITE(sublinear_mem_config)
+            DEF_READWRITE(dtr_config)
             // DEF_READWRITE(eager_evaluation)
             // DEF_READWRITE(imperative_proxy_graph)
             // DEF_READWRITE(extra_vardeps)
@@ -434,6 +436,14 @@ void init_graph_rt(py::module m) {
             DEF_READWRITE(lb_memory)
             DEF_READWRITE(num_worker);
 
+#undef CURRENT_CLASS
+
+#define CURRENT_CLASS cg::ComputingGraph::Options::DTRConfig
+
+    py::class_<CURRENT_CLASS>(PyComputingGraphOptions, "DTRConfig")
+            DEF_READWRITE(eviction_threshold)
+            DEF_READWRITE(evictee_minimum_size);
+
 #undef CURRENT_CLASS
 
     auto common = rel_import("common", m, 1);
diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index 25386ee7636ad2589676e2d89fe27212fbf90d24..af1707193bb03fa9f82c2fe0052a91f077dae38d 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -250,6 +250,10 @@ ComputingGraphImpl::Components::Components(ComputingGraphImpl* owner)
           seq_modifier_for_sublinear_memory{owner,
                                             &(owner->options().sublinear_mem_config)},
 #endif
+#if MGB_ENABLE_DTR
+          seq_modifier_for_dtr{owner,
+                               &(owner->options().dtr_config)},
+#endif
 #if MGB_ENABLE_MEMORY_SWAP
           memory_swap_support{owner},
 #endif
@@ -473,6 +477,7 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
 
 #if MGB_ENABLE_SUBLINEAR
     if (options().enable_sublinear_memory_opt) {
+        mgb_assert(!options().enable_dtr_memory_opt);
         if (!sopr_stat.has_virtual_grad) {
             mgb_log_debug(
                     "no virtual grad var; sublinear memory may produce "
@@ -485,6 +490,15 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
     mgb_assert(!options().enable_sublinear_memory_opt);
 #endif // MGB_ENABLE_SUBLINEAR
 
+#if MGB_ENABLE_DTR
+    if (options().enable_dtr_memory_opt) {
+        mgb_assert(!options().enable_sublinear_memory_opt);
+        seq_modifier_for_dtr().set_priority_before_opt(dest_vars);
+    }
+#else
+    mgb_assert(!options().enable_dtr_memory_opt);
+#endif // MGB_ENABLE_DTR
+
 #if !MGB_BUILD_SLIM_SERVING
     mgb_assert(!options().eager_evaluation,
                "attempt to compile eager_evaluation graph");
@@ -558,7 +572,10 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
     CompSeqExtraInfo extra_info;
     cmpnt.seq_comp_node_opt.optimize_comp_nodes(dest_vars);
 
+    bool init_flag = false;
     auto init_opr_seq = [&]() {
+        mgb_assert(!init_flag);
+        init_flag = true;
         ThinHashMap var2idx;
         std::unordered_map
@@ -629,6 +646,15 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
     mgb_assert(!options().enable_memory_swap);
 #endif
 
+#if MGB_ENABLE_DTR
+    if (options().enable_dtr_memory_opt) {
+        MGB_TRY {
+            seq_modifier_for_dtr().modify_endpoint_vars(dest_vars);
+            init_opr_seq();
+        }
+        MGB_FINALLY(seq_modifier_for_dtr().restore_graph_option());
+    }
+#endif
 #if MGB_ENABLE_SUBLINEAR
     if (options().enable_sublinear_memory_opt) {
         MGB_TRY {
@@ -650,12 +676,11 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
                  */
                 seq_modifier_for_sublinear_memory().restore_graph_option());
         seq_modifier_for_sublinear_memory().sanity_check(*opr_seq);
-    } else {
-        init_opr_seq();
     }
-#else
-    init_opr_seq();
 #endif // MGB_ENABLE_SUBLINEAR
+    if (!init_flag) {
+        init_opr_seq();
+    }
 
     return {std::move(extra_info), opr_seq, std::move(dest_vars)};
 }
@@ -751,6 +776,13 @@ ComputingGraphImpl::seq_modifier_for_sublinear_memory() {
 }
 #endif
 
+#if MGB_ENABLE_DTR
+SeqModifierForDTR&
+ComputingGraphImpl::seq_modifier_for_dtr() {
+    return components().seq_modifier_for_dtr;
+}
+#endif
+
 void ComputingGraphImpl::share_device_memory_with(ComputingGraph& other) {
     mgb_assert(
             !m_current_comp_seq,
diff --git a/src/core/impl/graph/cg_impl.h b/src/core/impl/graph/cg_impl.h
index fb23ffcd0b26a2f03f35e4d69a8daa55e960977c..478cb70cbd74cac1c45051f8a986baa2e5ece77f 100644
--- a/src/core/impl/graph/cg_impl.h
+++ b/src/core/impl/graph/cg_impl.h
@@ -15,6 +15,7 @@
 #include "./grad_manager.h"
 #include "./graph_opt.h"
 #include "./seq_comp_node_opt_impl.h"
+#include "./seq_dtr.h"
 #include "./seq_sublinear_memory.h"
 #include "./static_infer_impl.h"
 #include "./swap/memory_swap.h"
@@ -80,6 +81,9 @@ class ComputingGraphImpl final : public ComputingGraph {
 #if MGB_ENABLE_SUBLINEAR
         SeqModifierForSublinearMemory seq_modifier_for_sublinear_memory;
 #endif
+#if MGB_ENABLE_DTR
+        SeqModifierForDTR seq_modifier_for_dtr;
+#endif
 #if MGB_ENABLE_MEMORY_SWAP
         swap::MemorySwap memory_swap_support;
 #endif
@@ -218,6 +222,9 @@ public:
     SeqModifierForSublinearMemory& seq_modifier_for_sublinear_memory();
 #endif
 
+#if MGB_ENABLE_DTR
+    SeqModifierForDTR& seq_modifier_for_dtr();
+#endif
     void share_device_memory_with(ComputingGraph& other) override;
 
     void set_device_memory_allocator(
diff --git a/src/core/impl/graph/seq_dtr.cpp b/src/core/impl/graph/seq_dtr.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6a47b0f566a610049d168be707e72c4ade460961
--- /dev/null
+++ b/src/core/impl/graph/seq_dtr.cpp
@@ -0,0 +1,374 @@
+/**
+ * \file src/core/impl/graph/seq_dtr.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#include "./seq_dtr.h"
+
+#if MGB_ENABLE_DTR
+
+using namespace mgb;
+using namespace cg;
+
+namespace {
+
+bool is_bad_opr(OperatorNodeBase* opr) {
+    using F = OperatorNodeBase::NodeProp::Flag;
+    return opr->node_prop().contain(
+            F::IMPURE_FUNC | F::NO_AUTOMATIC_DUP | F::FORCE_UPDATE_INPUT_VAR);
+}
+
+} // namespace
+
+class SeqModifierForDTR::ModifyActionPlanner : public ModifyActionPlannerBase {
+public:
+    ModifyActionPlanner(SeqModifierBase* par) : ModifyActionPlannerBase{par} {}
+
+    void prepare(const OprNodeArray& opr_seq);
+
+    SeqModifyAction perform_dtr(CompNode comp_node, const OprNodeArray& seq, Config* config);
+};
+
+SeqModifierForDTR::SeqModifierForDTR(ComputingGraphImpl* owner, Config* config_g)
+        : SeqModifierBase(owner), m_config(config_g) {}
+
+void SeqModifierForDTR::modify_endpoint_vars(VarNodeArray& endpoints) {
+    var_map().clear();
+    auto comp_seq = MemoryOptimizerHelper::CompSeq(owner_graph(), endpoints);
+    auto config =
+            MemoryOptimizerHelper::SubGraphConfig()
+                    /*.add_bad_opr_flag(
+                            OperatorNodeBase::NodeProp::Flag::IMPURE_FUNC)
+                    .add_bad_opr_flag(
+                            OperatorNodeBase::NodeProp::Flag::NO_AUTOMATIC_DUP)
+                    .add_bad_opr_flag(OperatorNodeBase::NodeProp::Flag::
+                                              FORCE_UPDATE_INPUT_VAR)*/
+                    // NOTE: it should not actually involve any opr with the above
+                    // flags, but for better results, some oprs (e.g. CudnnBatchNorm)
+                    // should be involved, and they are guaranteed to NEVER recompute.
+                    .add_bad_var_flag(VarNode::Flag::VOLATILE_CONTENT)
+                    .add_bad_var_flag(VarNode::Flag::NO_SYS_STATIC_MEM_ALLOC)
+                    .add_bad_var_flag(VarNode::Flag::NO_SYS_MEM_ALLOC)
+                    .add_bad_var_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE);
+    auto cn2oprseq = mem_opt().split_into_cn2oprseq(*comp_seq.m_seq, config);
+
+    if (cn2oprseq->empty()) {
+        return;
+    }
+    SeqModifyAction action;
+    ModifyActionPlanner* planner = new ModifyActionPlanner(this);
+    for (auto&& i : *cn2oprseq) {
+        auto&& cur = planner->perform_dtr(i.first, i.second, m_config);
+        action.insert(cur.begin(), cur.end());
+    }
+    apply_action(action, *comp_seq.m_seq);
+    for (auto&& i : endpoints) {
+        auto iter = var_map().find(i);
+        if (iter != var_map().end()) {
+            i = iter->second;
+        }
+    }
+}
+
+void SeqModifierForDTR::ModifyActionPlanner::prepare(const OprNodeArray& opr_seq) {
+    init_seq(opr_seq, false);
+
+    for (size_t i = 0; i < seq().size(); ++i) {
+        auto opr = seq()[i].get();
+        size_t est = 0;
+        for (auto i : opr->input) {
+            est += i->size;
+        }
+        for (auto i : opr->output) {
+            est += i->size;
+        }
+        opr->estimate_compute_time = static_cast<double>(est) / 1e8;
+    }
+}
+
+SeqModifierForDTR::SeqModifyAction SeqModifierForDTR::ModifyActionPlanner::perform_dtr(
+        CompNode comp_node, const OprNodeArray& opr_seq, Config* config) {
+    prepare(opr_seq);
+    SeqModifyAction action;
+
+    if (comp_node.locator().stream < 0) {
+        // do not modify system stream oprs
+        return action;
+    }
+
+    ThinHashSet<Var*> alive_vars;
+    size_t cur_usage = 0;
+
+    //! map from original var to latest var
+    ThinHashMap<VarNode*, Var*> latest_var;
+    ThinHashMap<VarNode*, int> pin;
+
+    auto need_regen = [&](Var* var) {
+        return alive_vars.find(var) == alive_vars.end();
+    };
+
+    auto add_alive = [&](Var* var) {
+        auto&& ins = alive_vars.insert(var);
+        mgb_assert(ins.second);
+        cur_usage += var->size;
+    };
+
+    auto remove_alive = [&](Var* var) {
+        if (alive_vars.erase(var)) {
+            auto size = var->size;
+            mgb_assert(size <= cur_usage);
+            cur_usage -= size;
+        }
+    };
+
+    auto get_latest = [&](Var* var) {
+        auto iter = latest_var.find(var->orig_var);
+        if (iter == latest_var.end()) {
+            return var;
+        } else {
+            return iter->second;
+        }
+    };
+
+    double est_time = 0;
+
+    ThinHashMap<Var*, double> dfs_back;
+    ThinHashMap<Var*, double> dfs_front;
+
+    // estimated cost of regenerating an evicted var: the recursive recompute
+    // cost of the var itself (dfs_b) times that of its evicted dependents (dfs_f)
+    auto regen_time = [&](Var* var) {
+        thin_function<double(Var*)> dfs_b;
+        thin_function<double(Var*)> dfs_f;
+        dfs_b = [&](Var* var) {
+            if (dfs_back.find(var) != dfs_back.end()) {
+                return dfs_back[var];
+            }
+            auto opr = var->owner_opr();
+            double sum_time = opr->estimate_compute_time;
+            for (auto i : opr->input) {
+                auto ivar = get_latest(i);
+                if (need_regen(ivar)) {
+                    sum_time += dfs_b(ivar);
+                }
+            }
+            dfs_back[var] = sum_time;
+            return sum_time;
+        };
+        dfs_f = [&](Var* var) {
+            if (dfs_front.find(var) != dfs_front.end()) {
+                return dfs_front[var];
+            }
+            double sum_time = 1;
+            for (size_t j = 1; j < var->access_rec.size(); j++) {
+                auto dep_opr = var->access_rec[j].opr;
+                for (auto o : dep_opr->output) {
+                    o = get_latest(o);
+                    if (need_regen(o)) {
+                        sum_time += dfs_f(o);
+                    }
+                }
+            }
+            dfs_front[var] = sum_time;
+            return sum_time;
+        };
+        return dfs_f(var) * dfs_b(var);
+    };
+
+    static constexpr double MAX_EVAL_VALUE = std::numeric_limits<double>::max();
+    auto find_best = [&]() {
+        Var* best = nullptr;
+        double min_eval_value = MAX_EVAL_VALUE;
+        dfs_back.clear();
+        dfs_front.clear();
+        for (auto var : alive_vars) {
+            if (var->size < config->evictee_minimum_size
+                    || pin[var->orig_var] > 0
+                    || is_bad_opr(var->owner_opr()->orig_opr)) {
+                continue;
+            }
+            double regen = regen_time(var);
+            // DTR eviction heuristic: pick the var minimizing
+            // recompute_cost / (size * staleness)
+            double eval_value = regen / static_cast<double>(var->size)
+                    / (est_time - var->last_access_time + 1e-8);
+            if (eval_value < min_eval_value) {
+                min_eval_value = eval_value;
+                best = var;
+            }
+        }
+        return best;
+    };
+
+    auto do_evict = [&](Var* var) {
+        remove_alive(var);
+    };
+
+    // evict the best candidates until the pending allocation fits under
+    // the user-specified eviction_threshold
+    auto auto_evict = [&](size_t needed) {
+        while (cur_usage + needed >= config->eviction_threshold) {
+            Var* v = find_best();
+            if (!v) {
+                break;
+            }
+            do_evict(v);
+        }
+    };
+
+    thin_function<Var*(Opr*, Var*)> regenerate;
+    regenerate = [&](Opr* reader, Var* var) {
+        auto opr = var->owner_opr();
+        // FIXME: if var can not be recomputed, the previous eviction may fail
+        if (is_bad_opr(opr->orig_opr)) {
+            return var;
+        }
+
+        auto new_opr_storage = opr_mempool().alloc_unique(
+                opr->orig_opr, static_cast<size_t>(DUPOPR_TIME));
+        auto new_opr = new_opr_storage.get();
+
+        new_opr->input.reserve(opr->input.size());
+        new_opr->output.reserve(opr->output.size());
+
+        for (auto i : opr->input) {
+            i->last_access_time = est_time;
+            pin[i->orig_var]++;
+        }
+        for (auto o : opr->output) {
+            auto lo = get_latest(o);
+            if (!need_regen(lo)) {
+                remove_alive(lo);
+            }
+        }
+        for (auto i : opr->input) {
+            auto ivar = get_latest(i);
+            if (need_regen(ivar)) {
+                ivar = regenerate(reader, ivar);
+            }
+            new_opr->input.push_back(ivar);
+            ivar->access_rec.emplace_back(new_opr);
+        }
+
+        reader->oprs_insert_before.emplace_back(std::move(new_opr_storage));
+
+        size_t needed = 0;
+        for (auto o : opr->output) {
+            needed += o->size;
+        }
+        auto_evict(needed);
+        Var* new_var = nullptr;
+        for (auto o : opr->output) {
+            auto lo = get_latest(o);
+            auto&& ovar = var_mempool().alloc_unique(lo->orig_var, lo->size,
+                                                     new_opr);
+            ovar->recomp_id = lo->recomp_id + 1;
+            new_opr->output.push_back(ovar.get());
+            if (o == var) {
+                new_var = ovar.get();
+            }
+            add_alive(ovar.get());
+            ovar->last_access_time = est_time;
+            latest_var[o->orig_var] = ovar.get();
+            var_storage().emplace_back(std::move(ovar));
+        }
+        est_time += opr->estimate_compute_time;
+        for (auto i : opr->input) {
+            pin[i->orig_var]--;
+        }
+        return new_var;
+    };
+
+    for (size_t j = 0; j < seq().size(); ++j) {
+        auto opr = seq()[j].get();
+        for (auto i : opr->input) {
+            pin[i->orig_var]++;
+        }
+        for (auto i : opr->input) {
+            i = get_latest(i);
+            if (need_regen(i)) {
+                i = regenerate(opr, i);
+            }
+            i->last_access_time = est_time;
+        }
+        size_t needed = 0;
+        for (auto o : opr->output) {
+            needed += o->size;
+        }
+        auto_evict(needed);
+        est_time += opr->estimate_compute_time;
+        for (auto o : opr->output) {
+            add_alive(o);
+            o->last_access_time = est_time;
+        }
+        for (auto i : opr->input) {
+            pin[i->orig_var]--;
+        }
+        for (auto i : opr->input) {
+            i = get_latest(i);
+            if (opr == i->last_access_opr())
+                remove_alive(i);
+        }
+    }
+    for (size_t j = 0; j < seq().size(); ++j) {
+        auto opr = seq()[j].get();
+        auto&& arr = opr->oprs_insert_before;
+        if (arr.empty()) {
+            continue;
+        }
+        auto&& dest = action[opr->orig_opr];
+        dest.reserve(arr.size());
+        for (auto&& i : arr) {
+            dest.push_back(i->orig_opr);
+        }
+    }
+    return action;
+}
+
+void SeqModifierForDTR::apply_action(SeqModifyAction& action,
+                                     const OprNodeArray& oprseq) {
+    auto cur_priority = std::numeric_limits<int>::min();
+
+    ThinHashSet<OperatorNodeBase*> modified_opr;
+    ThinHashMap<OperatorNodeBase*, size_t> recomp_id;
+    auto set_priority = [&](OperatorNodeBase* opr) {
+        mgb_assert(modified_opr.insert(opr).second);
+        mem_opt().set_priority(opr, cur_priority++);
+    };
+
+    auto on_opr_visited = [&](OperatorNodeBase* opr) {
+        if (replace_vars(opr->input())) {
+            recomp_id[opr]++;
+            opr = copy_opr_from_new_inputs(opr, true, recomp_id[opr] - 1);
+        }
+        set_priority(opr);
+    };
+
+    DepOprIter dep_iter{on_opr_visited};
+
+    for (auto opr : oprseq) {
+        auto iter = action.find(opr);
+        if (iter != action.end()) {
+            for (auto i : iter->second) {
+                replace_vars(i->input());
+                recomp_id[i]++;
+                auto opr_new = copy_opr_from_new_inputs(i, false, recomp_id[i] - 1);
+                set_priority(opr_new);
+            }
+            action.erase(iter);
+        }
+        dep_iter.add(opr);
+    }
+    mgb_assert(action.empty());
+}
+
+#endif // MGB_ENABLE_DTR
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/core/impl/graph/seq_dtr.h b/src/core/impl/graph/seq_dtr.h
new file mode 100644
index 0000000000000000000000000000000000000000..e9b86d5c0c4b9f7126144b127d3a24379e031aa1
--- /dev/null
+++ b/src/core/impl/graph/seq_dtr.h
@@ -0,0 +1,43 @@
+/**
+ * \file src/core/impl/graph/seq_dtr.h
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ */
+
+#pragma once
+
+#include "./memory_optimizer.h"
+#include "./seq_modifier_base.h"
+#include "megbrain/graph/cg.h"
+
+#if MGB_ENABLE_DTR
+
+namespace mgb {
+namespace cg {
+
+class SeqModifierForDTR : public SeqModifierBase {
+    //! Config options
+    using Config = mgb::cg::ComputingGraph::Options::DTRConfig;
+    Config* m_config;
+
+    class ModifyActionPlanner;
+
+public:
+    SeqModifierForDTR(ComputingGraphImpl* owner, Config* config_g);
+
+    void modify_endpoint_vars(VarNodeArray& endpoints);
+
+    void apply_action(SeqModifyAction& action, const OprNodeArray& oprseq);
+};
+
+} // namespace cg
+} // namespace mgb
+
+#endif // MGB_ENABLE_DTR
+
+// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/core/impl/graph/seq_modifier_base.cpp b/src/core/impl/graph/seq_modifier_base.cpp
index 0eaa9d3281c9602f4c370b14597233e4eb293784..5c2d45d5784b73d84626eadfaacd7a2a2219b448 100644
--- a/src/core/impl/graph/seq_modifier_base.cpp
+++ b/src/core/impl/graph/seq_modifier_base.cpp
@@ -11,12 +11,12 @@
 
 #include "./seq_modifier_base.h"
 
-#if MGB_ENABLE_SUBLINEAR
+#if MGB_ENABLE_SUBLINEAR || MGB_ENABLE_DTR
 
 using namespace mgb;
 using namespace cg;
 
-void SeqModifierBase::ModifyActionPlannerBase::init_seq(const OprNodeArray& opr_seq) {
+void SeqModifierBase::ModifyActionPlannerBase::init_seq(const OprNodeArray& opr_seq, bool remove_unused_output) {
     m_orig_opr_seq = &opr_seq;
 
     m_var_storage.clear();
@@ -76,15 +76,16 @@ void SeqModifierBase::ModifyActionPlannerBase::init_seq(const OprNodeArray& opr_
         mgb_assert(!opr->output.empty());
     }
 
-    // remove unused output
-    for (auto&& i : m_seq) {
-        auto&& oarr = i->output;
-        for (size_t j = 0; j < oarr.size();) {
-            if (oarr[j]->access_rec.size() == 1) {
-                std::swap(oarr[j], oarr.back());
-                oarr.pop_back();
-            } else
-                ++j;
+    if (remove_unused_output) {
+        for (auto&& i : m_seq) {
+            auto&& oarr = i->output;
+            for (size_t j = 0; j < oarr.size();) {
+                if (oarr[j]->access_rec.size() == 1) {
+                    std::swap(oarr[j], oarr.back());
+                    oarr.pop_back();
+                } else
+                    ++j;
+            }
         }
     }
 }
@@ -105,17 +106,14 @@ bool SeqModifierBase::replace_vars(const VarNodeArray& inputs) {
 OperatorNodeBase* SeqModifierBase::copy_opr_from_new_inputs(
         OperatorNodeBase* opr, bool recomp, size_t recomp_cnt) {
     auto config = opr->config();
-    // update operator instance id to bybass the shallow copy's cache if
-    // it's a dup-opr-copying due to discarding.
-    // Don't update instance id by `this` pointer if it's a recomp-opr-copying
-    // because:
-    // 0) recomp-opr would be copied iff its input vars is changed
-    // 1) some pair of recomp-opr and dup-opr have the same inputs, params
-    //    and config, we use instance id to differentiate them.
+    // update operator instance id to bypass the shallow copy's cache because
+    // some pairs of recomp-oprs and dup-oprs have the same inputs, params and
+    // config, so we use the instance id to differentiate them. To be safe, we
+    // update the instance id whether the reason is `recomp` or `dup`.
     config.name(opr->name() + (recomp ? ":recomp" : ":dup") +
                 std::to_string(recomp_cnt));
     config.update_instance_id(reinterpret_cast<void*>(
            reinterpret_cast<size_t>(this) +
-           ((static_cast<size_t>(recomp) + 1) << 10) * recomp_cnt));
+           (recomp_cnt << 1 | (recomp & 1))));
 
     // Note: if all outputs of op were placed on the same comp_node, since its
     // stream maybe changed during seq_comp_node_opt, output's comp_node has
@@ -156,6 +154,6 @@ OperatorNodeBase* SeqModifierBase::copy_opr_from_new_inputs(
     return opr_new;
 }
 
-#endif // MGB_ENABLE_SUBLINEAR
+#endif // MGB_ENABLE_SUBLINEAR || MGB_ENABLE_DTR
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
\ No newline at end of file
diff --git a/src/core/impl/graph/seq_modifier_base.h b/src/core/impl/graph/seq_modifier_base.h
index 6ce74c80551ea75229248c48960c824332847d1a..450f01b9c6e88f88e35e005c6928ab5cc1f9cb14 100644
--- a/src/core/impl/graph/seq_modifier_base.h
+++ b/src/core/impl/graph/seq_modifier_base.h
@@ -17,12 +17,11 @@
 #include "megbrain/plugin/opr_footprint.h"
 #include "megbrain/serialization/opr_shallow_copy.h"
 #include "megbrain/system.h"
-#include "megbrain/utils/async_worker.h"
 #include "megbrain/utils/arith_helper.h"
 #include "megbrain/utils/mempool.h"
 #include "megbrain/utils/timer.h"
 
-#if MGB_ENABLE_SUBLINEAR
+#if MGB_ENABLE_SUBLINEAR || MGB_ENABLE_DTR
 
 namespace mgb {
 namespace cg {
@@ -57,11 +56,11 @@ public:
         static constexpr size_t DUPOPR_TIME =
                 std::numeric_limits<size_t>::max() - 1;
 
-        const SeqModifierBase* const par_modifier() {
+        auto& par_modifier() {
            return m_par_modifier;
        }
 
-        const OprNodeArray* const orig_opr_seq() {
+        auto& orig_opr_seq() {
            return m_orig_opr_seq;
        }
 
@@ -94,7 +93,7 @@ public:
        }
 
        //! init m_orig_opr_seq from opr_seq, should be called first.
-        void init_seq(const OprNodeArray& opr_seq);
+        void init_seq(const OprNodeArray& opr_seq, bool remove_unused_output=true);
     };
 
     SeqModifierBase(ComputingGraphImpl* owner) : m_mem_opt(owner), m_owner_graph(owner) {}
@@ -103,7 +102,7 @@ public:
        return m_mem_opt;
    }
 
-    ComputingGraphImpl* const owner_graph() {
+    auto& owner_graph() {
        return m_owner_graph;
    }
 
@@ -232,6 +231,6 @@ struct SeqModifierBase::Var {
 } // namespace cg
 } // namespace mgb
 
-#endif // MGB_ENABLE_SUBLINEAR
+#endif // MGB_ENABLE_SUBLINEAR || MGB_ENABLE_DTR
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
\ No newline at end of file
diff --git a/src/core/include/megbrain/graph/bases.h b/src/core/include/megbrain/graph/bases.h
index ed8b588824d741b5aef2b3874d192debc657a72f..6b79d385c698b7d35921b5e845a0d4f2e9407f91 100644
--- a/src/core/include/megbrain/graph/bases.h
+++ b/src/core/include/megbrain/graph/bases.h
@@ -18,6 +18,10 @@
 
 #include
 
+#ifndef MGB_ENABLE_DTR
+#define MGB_ENABLE_DTR ((!MGB_BUILD_SLIM_SERVING) && (!!MGB_HAVE_THREAD))
+#endif // MGB_ENABLE_DTR
+
 #ifndef MGB_ENABLE_SUBLINEAR
 #define MGB_ENABLE_SUBLINEAR ((!MGB_BUILD_SLIM_SERVING) && (!!MGB_HAVE_THREAD))
 #endif // MGB_ENABLE_SUBLINEAR
diff --git a/src/core/include/megbrain/graph/cg.h b/src/core/include/megbrain/graph/cg.h
index 15467552511a9b3ab8e0dfa9d3c26362145e4b85..ce4524fa959a8aeae5e24ca457b1b5532f0a4550 100644
--- a/src/core/include/megbrain/graph/cg.h
+++ b/src/core/include/megbrain/graph/cg.h
@@ -433,6 +433,15 @@ class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>,
             int num_worker = sys::get_cpu_count() / 2;
         } sublinear_mem_config;
 
+        //! whether to enable DTR memory optimization
+        bool enable_dtr_memory_opt = false;
+
+        //! Control parameter for DTR memory optimization
+        struct DTRConfig {
+            size_t eviction_threshold = 0;
+            size_t evictee_minimum_size = 1ULL << 20;
+        } dtr_config;
+
         //! do not re-profile to select best impl algo when input shape
         //! changes (use previous algo)
         bool no_profiling_on_shape_change = false;
diff --git a/src/core/test/sublinear_memory.cpp b/src/core/test/sublinear_memory.cpp
index 83f26f587ec95155ec2f44fff0f85b0c5c3db428..0d0aaf0aa234323eb09163864f4b5b80e240fb9a 100644
--- a/src/core/test/sublinear_memory.cpp
+++ b/src/core/test/sublinear_memory.cpp
@@ -172,6 +172,15 @@ TEST(TestSublinearMemory, FullConv) {
         }
     }
 
+    for (size_t i = 0; i < grad_params_get.size(); ++i)
+        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
+
+    graph->options().enable_sublinear_memory_opt = false;
+    graph->options().enable_dtr_memory_opt = true;
+    graph->options().dtr_config.eviction_threshold = 1ULL << 30;
+    auto func = graph->compile(out_spec);
+    func->execute();
+
     for (size_t i = 0; i < grad_params_get.size(); ++i)
         MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
 }
@@ -238,6 +247,15 @@ TEST(TestSublinearMemory, ConcatSplit) {
         }
     }
 
+    for (size_t i = 0; i < grad_params_get.size(); ++i)
+        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
+
+    graph->options().enable_sublinear_memory_opt = false;
+    graph->options().enable_dtr_memory_opt = true;
+    graph->options().dtr_config.eviction_threshold = 1ULL << 30;
+    auto func = graph->compile(out_spec);
+    func->execute();
+
     for (size_t i = 0; i < grad_params_get.size(); ++i)
         MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
 }
@@ -302,6 +320,15 @@ TEST(TestSublinearMemory, MultiOutputOpr) {
         }
     }
 
+    for (size_t i = 0; i < grad_params_get.size(); ++i)
+        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
+
+    graph->options().enable_sublinear_memory_opt = false;
+    graph->options().enable_dtr_memory_opt = true;
+    graph->options().dtr_config.eviction_threshold = 1ULL << 30;
+    auto func = graph->compile(out_spec);
+    func->execute();
+
     for (size_t i = 0; i < grad_params_get.size(); ++i)
         MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
 }
@@ -365,6 +392,15 @@ TEST(TestSublinearMemory, LongChain) {
         }
     }
 
+    for (size_t i = 0; i < grad_params_get.size(); ++i)
+        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-4);
+
+    graph->options().enable_sublinear_memory_opt = false;
+    graph->options().enable_dtr_memory_opt = true;
+    graph->options().dtr_config.eviction_threshold = 1ULL << 30;
+    auto func = graph->compile(out_spec);
+    func->execute();
+
     for (size_t i = 0; i < grad_params_get.size(); ++i)
         MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-4);
 }
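
Reviewer note, not part of the patch: the snippet below is a minimal sketch of how the new option is meant to be used from Python, inferred from the DTRConfig and trace(..., dtr_config=...) signatures added above; the computation, shapes, and threshold values are illustrative placeholders. Keep in mind that enable_dtr_memory_opt and enable_sublinear_memory_opt are mutually exclusive (compile_prepare asserts this), and eviction_threshold must be a positive byte count or the DTRConfig assert fires.

    import numpy as np

    import megengine.functional as F
    from megengine import tensor
    from megengine.jit import DTRConfig, trace

    # Ask the planner to keep estimated per-comp-node usage below ~2 GiB;
    # tensors smaller than 1 MiB (the default evictee_minimum_size) are
    # never eviction candidates.
    config = DTRConfig(eviction_threshold=2 * 1024 ** 3, evictee_minimum_size=1 << 20)

    @trace(symbolic=True, dtr_config=config)
    def step(x, w):
        # placeholder computation standing in for a real training step
        return (F.matmul(x, w) ** 2).sum()

    x = tensor(np.random.randn(1024, 1024).astype("float32"))
    w = tensor(np.random.randn(1024, 1024).astype("float32"))
    print(step(x, w))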