diff --git a/imperative/python/megengine/core/tensor/utils.py b/imperative/python/megengine/core/tensor/utils.py index a3d764021b6b7b02ad78f4cee77fe60f4d840358..a0e2648eb354edba5e9f20ec4c567f12ee8e83fb 100644 --- a/imperative/python/megengine/core/tensor/utils.py +++ b/imperative/python/megengine/core/tensor/utils.py @@ -22,6 +22,7 @@ from .._imperative_rt.core2 import ( make_shape_tuple, ) from .._imperative_rt.ops import SubgraphBuilder as _SubgraphBuilder +from .._imperative_rt.ops import jit_supported from .._wrap import as_device from ..autodiff.grad import Function from ..ops import builtin @@ -234,6 +235,10 @@ def subgraph( gopt_level = None # disable jit and compile jit_fusion = False + if jit_fusion and not jit_supported: + jit_fusion = False # jit unusable, fallback to graph compile + gopt_level = 2 + def as_op(op, nargs): if isinstance(op, str): assert (op, nargs) in _opr_map, "unknown operator" diff --git a/imperative/python/src/ops.cpp b/imperative/python/src/ops.cpp index aea124e16cc0f62f46d17246ddeaa603eb83d231..3bc859aac746ee74701165931d60fba56ebeb1a7 100644 --- a/imperative/python/src/ops.cpp +++ b/imperative/python/src/ops.cpp @@ -652,6 +652,11 @@ void init_ops(py::module m) { }); m.def("set_jit_enabled", &JITFusionOp::set_enabled); + bool jit_supported = false; +#if MGB_JIT + jit_supported = true; +#endif + m.attr("jit_supported") = jit_supported; auto custom = submodule(m, "_custom"); init_custom(custom); diff --git a/imperative/src/impl/ops/utility.cpp b/imperative/src/impl/ops/utility.cpp index 07a434a3ffcc2af61df917d86a02c3b6be402d2b..8ac834f249c54fdb89bdab9aab5094ce3c33015a 100644 --- a/imperative/src/impl/ops/utility.cpp +++ b/imperative/src/impl/ops/utility.cpp @@ -9,21 +9,26 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#include +#include -#include "../op_trait.h" #include "megbrain/imperative/graph_cache.h" #include "megbrain/imperative/opr_utility.h" #include "megbrain/imperative/ops/autogen.h" #include "megbrain/imperative/ops/opr_attr.h" #include "megbrain/imperative/ops/utility.h" #include "megbrain/imperative/subgraph_detail.h" -#include "megbrain/jit/executor_opr.h" #include "megbrain/opr/io.h" #include "megbrain/opr/tensor_gen.h" #include "megbrain/opr/tensor_manip.h" #include "megbrain/opr/utility.h" +#if MGB_JIT +#include "megbrain/jit/executor_opr.h" +#endif + +#include "../event_pool.h" +#include "../op_trait.h" + namespace mgb::imperative { MGB_DYN_TYPE_OBJ_FINAL_IMPL(GenericPyOp); @@ -309,7 +314,7 @@ struct ComputingGraphHolder { SmallVector input_vars; SmallVector output_vars; std::shared_ptr allocator; - SmallVector> events; + SmallVector> events; std::unique_ptr updater; void initialize( @@ -402,7 +407,7 @@ struct ComputingGraphHolder { return true; }); for (auto&& comp_node : comp_nodes) { - events.push_back(comp_node.create_event()); + events.push_back(EventPool::without_timer().alloc_shared(comp_node)); events.back()->record(); } } @@ -510,7 +515,7 @@ ComputingGraphHolder& get_computing_graph( std::shared_ptr compiled_op, const SmallVector& descs) { using ComputingGraphHolderCache = - OpMethResultCache>>>; + OpMethResultCache>>>; thread_local auto cache = std::make_unique(); thread_local size_t nr_cg_holders = 0; typename ComputingGraphHolderCache::key_t cache_key = {compiled_op, descs}; @@ -540,20 +545,28 @@ ComputingGraphHolder& get_computing_graph( } } if (holder) { - cg_holder_queue.pop(); + cg_holder_queue.pop_front(); } } if (!holder) { // create new computing graph - holder = std::make_unique>(); - auto& cg_holder = *holder; - cg_holder.initialize(compiled_op->cast_final_safe(), descs); - nr_cg_holders++; - mgb_log_debug( - "add new computing graph for compiled op, now %zu graphs", - nr_cg_holders); + auto create_holder = [&] { + auto holder = std::make_unique>(); + auto& cg_holder = *holder; + cg_holder.initialize(compiled_op->cast_final_safe(), descs); + nr_cg_holders++; + mgb_log_debug( + "add new computing graph for compiled op, now %zu graphs", + nr_cg_holders); + return holder; + }; + size_t nr_graphs = std::max(cg_holder_queue.size(), (size_t)1); + for (size_t i = 1; i < nr_graphs; ++i) { + cg_holder_queue.push_front(create_holder()); + } + holder = create_holder(); } - cg_holder_queue.push(std::move(holder)); + cg_holder_queue.push_back(std::move(holder)); return *cg_holder_queue.back(); } @@ -670,6 +683,7 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { // skip for dump (JITExecutor can not be dumped) return outputs; } +#if MGB_JIT for (auto& output : outputs) { jit::InternalGraphGenerator igg{output->owner_opr()}; std::vector reverse_order; @@ -686,6 +700,9 @@ auto apply_on_var_node(const OpDef& def, const VarNodeArray& inputs) { auto ig = igg.generate(); output = jit::JITExecutor::make(ig, igg.orig_inps()).node(); } +#else + mgb_assert(false, "MGB_WITH_JIT was disabled"); +#endif return outputs; } diff --git a/src/jit/impl/nvrtc/compiler_cuda.cpp b/src/jit/impl/nvrtc/compiler_cuda.cpp index df5d8399b80989d08ca78f1d457c26d699f50c44..cc600f45abe1799ed7363ff4995f181496ff690f 100644 --- a/src/jit/impl/nvrtc/compiler_cuda.cpp +++ b/src/jit/impl/nvrtc/compiler_cuda.cpp @@ -216,11 +216,11 @@ void CudaExecutable::FuncCache::compile( ptx = NVRTCCompile(cuda_exe->m_source, major, minor); ptx_cache = PersistentCache::Blob{ptx.data(), ptx.size()}; cache.put(cache_category, key, ptx_cache.val()); + mgb_log("NVRTC JIT: compile %s for %d.%d: source_len=%zu ptx_len=%zu " + "time=%.3fms", + cuda_exe->m_name.c_str(), major, minor, key.size, ptx.size(), + timer.get_msecs()); } - mgb_log("NVRTC JIT: compile %s for %d.%d: source_len=%zu ptx_len=%zu " - "time=%.3fms", - cuda_exe->m_name.c_str(), major, minor, key.size, ptx.size(), - timer.get_msecs()); } void CudaExecutable::FuncCache::exec(