diff --git a/imperative/src/impl/ops/utility.cpp b/imperative/src/impl/ops/utility.cpp index 73445bfe8135fc56c299b743ea13fd0917572657..56f81e48e39579aadf4f1a50f5d1ee5041e23d16 100644 --- a/imperative/src/impl/ops/utility.cpp +++ b/imperative/src/impl/ops/utility.cpp @@ -9,6 +9,8 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ + #include + #include "megbrain/imperative/ops/autogen.h" #include "megbrain/imperative/ops/utility.h" #include "megbrain/imperative/ops/opr_attr.h" @@ -277,23 +279,50 @@ struct ComputingGraphHolder { SmallVector> inputs; SmallVector> outputs; std::shared_ptr allocator; + SmallVector> events; }; -thread_local OpMethResultCache cg_cache; - ComputingGraphHolder& get_computing_graph(std::shared_ptr compiled_op, SmallVector descs) { - OpMethArgs<> key = {compiled_op, descs}; - auto& cg_holder = cg_cache[key]; - if (!cg_holder.graph) { + using ComputingGraphHolderCache = OpMethResultCache>>; + thread_local ComputingGraphHolderCache cache; + thread_local size_t nr_cg_holders = 0; + ComputingGraphHolderCache::key_t cache_key = {compiled_op, descs}; + auto& cg_holder_queue = cache[cache_key]; + std::unique_ptr holder; + if(!cg_holder_queue.empty()) { + // pick one + std::swap(cg_holder_queue.front(), holder); + // check all events finished + for (auto&& event: holder->events) { + if (!event->finished()) { + bool queue_limited = event->comp_node().contain_flag(CompNode::Flag::QUEUE_LIMITED); + bool many_graph = cg_holder_queue.size() > 10; + if (queue_limited || !many_graph) { + std::swap(cg_holder_queue.front(), holder); + break; + } else { + // graph limit + mgb_log_debug("computing graph limit for compiled op exceeded, waiting for prev graph"); + event->host_wait(); + } + } + } + if (holder) { + cg_holder_queue.pop(); + } + } + if (!holder) { + // create new computing graph + holder = std::make_unique(); + auto& cg_holder = *holder; cg_holder.allocator = std::make_shared(); cg_holder.graph = ComputingGraph::make(); cg_holder.graph->options().force_dynamic_alloc = true; cg_holder.graph->options().async_exec_level = 0; cg_holder.graph->options().graph_opt_level = compiled_op->cast_final_safe().gopt_level; cg_holder.graph->options().enable_var_mem_defragment = false; - cg_holder.graph->options().comp_seq_sync_device = false; + // set allocator for DTR support cg_holder.graph->set_device_memory_allocator(cg_holder.allocator); - // cg_holder.graph->options().graph_opt.jit = 2; VarNodeArray input_vars; for (auto&& desc: descs) { auto input_device_nd = std::make_shared(); @@ -321,8 +350,19 @@ ComputingGraphHolder& get_computing_graph(std::shared_ptr compiled_op, Sm cg_holder.outputs.push_back(output_ptr); } cg_holder.executable = cg_holder.graph->compile(output_spec); + CompNode::UnorderedSet comp_nodes; + for (auto&& output_var: output_vars) { + comp_nodes.insert(output_var->comp_node()); + } + for (auto&& comp_node: comp_nodes) { + cg_holder.events.push_back(comp_node.create_event()); + cg_holder.events.back()->record(); + } + nr_cg_holders++; + mgb_log_debug("add new computing graph for compiled op, now %zu graphs", nr_cg_holders); } - return cg_holder; + cg_holder_queue.push(std::move(holder)); + return *cg_holder_queue.back(); } auto apply_on_physical_tensor( @@ -335,13 +375,17 @@ auto apply_on_physical_tensor( size_t nr_inputs = inputs.size(); auto shared_def = const_cast(def).shared_from_this(); auto& cg_holder = get_computing_graph(shared_def, input_descs); + // wait for last execution + cg_holder.executable->wait(); for (size_t i = 0; i < nr_inputs; ++i) { auto input_dev_tensor = inputs[i]->dev_tensor(); cg_holder.inputs[i]->reset(input_dev_tensor.storage(), input_dev_tensor.layout()); } cg_holder.allocator->current_op = shared_def; cg_holder.executable->execute(); - cg_holder.executable->wait(); + for (auto&& event: cg_holder.events) { + event->record(); + } SmallVector outputs; for (auto input_nd: cg_holder.inputs) { *input_nd = {}; diff --git a/imperative/src/include/megbrain/imperative/graph_cache.h b/imperative/src/include/megbrain/imperative/graph_cache.h index 153e60ab711dad77ac8d6d106db7616f5a8ae6b7..d4b1eedd812cf2c4bace719cec3dc89eb5287503 100644 --- a/imperative/src/include/megbrain/imperative/graph_cache.h +++ b/imperative/src/include/megbrain/imperative/graph_cache.h @@ -18,6 +18,7 @@ namespace mgb { namespace imperative { +//NOTE: only input dtype and comp_node used for hashing, shapes are ignored template struct OpMethArgs { std::shared_ptr op;