diff --git a/lite/include/lite/network.h b/lite/include/lite/network.h
index 90d4433ce61caff69b6159a101c3a5eaae3ef80b..1a69f14ee15afc1a1e638f45e19d03c758049070 100644
--- a/lite/include/lite/network.h
+++ b/lite/include/lite/network.h
@@ -47,10 +47,10 @@ LITE_API inline LiteAlgoSelectStrategy operator|(
 * @param no_profiling_on_shape_change do not re-profile to select best implement
 * algo when input shape changes (use previous algo)
 *
- * @param jit_level Execute supported operators with JIT (support MLIR,
- * NVRTC). Can only be used on Nvidia GPUs and X86 CPU, this value indicates JIT level:
- * level 1: for JIT execute with basic elemwise operator
- * level 2: for JIT execute elemwise and reduce operators
+ * @param jit_level Execute supported operators with JIT; see MGB_JIT_BACKEND
+ * for the available backends. This value indicates the JIT level:
+ * 1: JIT execution of basic elemwise operators
+ * 2: JIT execution of elemwise and reduce operators
 *
 * @param record_level flags to optimize the inference performance with record the
 * kernel tasks in first run, hereafter the inference all need is to execute the
diff --git a/lite/lite-c/include/lite-c/network_c.h b/lite/lite-c/include/lite-c/network_c.h
index c4e6c20b8e1e2dbe287c83c90e63847b226520d0..14eeb302b61a7284b1600b85c67d351b0faab5d9 100644
--- a/lite/lite-c/include/lite-c/network_c.h
+++ b/lite/lite-c/include/lite-c/network_c.h
@@ -36,10 +36,10 @@ extern "C" {
 * \param no_profiling_on_shape_change do not re-profile to select best impl
 * algo when input shape changes (use previous algo)
 *
- * \param jit_level Execute supported operators with JIT (support MLIR,
- * NVRTC). Can only be used on Nvidia GPUs, this value indicates JIT level:
- * 1 for basic elemwise opr;
- * 2 for including reduce operator
+ * \param jit_level Execute supported operators with JIT; see MGB_JIT_BACKEND
+ * for the available backends. This value indicates the JIT level:
+ * 1: basic elemwise oprs
+ * 2: elemwise and reduce oprs
 *
 * \param record_level flag optimize the inference performace with record the
 * kernel tasks in first run, hereafter the inference all need to execute the
diff --git a/lite/load_and_run/src/options/optimize_options.cpp b/lite/load_and_run/src/options/optimize_options.cpp
index 6ff868b6ed28f5f52b22fd7c98b114020ccce116..7c692f4214631d2a58f441a538106a2627760ca0 100644
--- a/lite/load_and_run/src/options/optimize_options.cpp
+++ b/lite/load_and_run/src/options/optimize_options.cpp
@@ -744,8 +744,8 @@ DEFINE_uint64(workspace_limit, SIZE_MAX, "set workspace upbound limit");
 ///////////////////////// other options for optimization /////////////////
 DEFINE_bool(
         enable_jit, false,
-        " Execute supported operators with JIT(now only support NVRTC). "
-        "Can only be used on Nvidia GPUs");
+        " Execute supported operators with JIT; see MGB_JIT_BACKEND for "
+        "the available backends");
 #if MGB_ENABLE_TENSOR_RT
 DEFINE_bool(
         tensorrt, false,
@@ -788,4 +788,4 @@ REGIST_OPTION_CREATOR(memory_optimize, lar::MemoryOptimizeOption::create_option)
 REGIST_OPTION_CREATOR(JIT, lar::JITOption::create_option);
 #if MGB_ENABLE_TENSOR_RT
 REGIST_OPTION_CREATOR(tensorRT, lar::TensorRTOption::create_option);
-#endif
\ No newline at end of file
+#endif
diff --git a/lite/pylite/megenginelite/network.py b/lite/pylite/megenginelite/network.py
index dcd8796580af781f33618efe48b48d48842d230e..7283355a54e0b24e10d6ddbaa1b771a873aa625f 100644
--- a/lite/pylite/megenginelite/network.py
+++ b/lite/pylite/megenginelite/network.py
@@ -41,8 +41,8 @@ class LiteOptions(Structure):
         no_profiling_on_shape_change: do not re-profile to select best implement
         algo when input shape changes (use previous algo)
 
-        jit_level: Execute supported operators with JIT (support MLIR,
-        NVRTC). Can only be used on Nvidia GPUs and X86 CPU, this value indicates JIT level:
+        jit_level: Execute supported operators with JIT; see MGB_JIT_BACKEND
+        for the available backends. This value indicates the JIT level:
 
         level 1: for JIT execute with basic elemwise operator
diff --git a/src/core/impl/graph/cg_impl.cpp b/src/core/impl/graph/cg_impl.cpp
index d96a0ed2c3a0f9d5b53fe527262b8d02626a1d76..6d57a6deff5336161f9b3de09658cf6d38d4e831 100644
--- a/src/core/impl/graph/cg_impl.cpp
+++ b/src/core/impl/graph/cg_impl.cpp
@@ -97,7 +97,7 @@ void DeviceMemoryAllocator::alloc_dynamic(
 }
 
 void DeviceMemoryAllocator::defrag_prealloc_contig(
-        ComputingGraph* graph, CompNode comp_node,
+        ComputingGraph* /*graph*/, CompNode comp_node,
        size_t size){MGB_TRY{comp_node.free_device(comp_node.alloc_device(size));
}
MGB_CATCH(MemAllocError&, {})
@@ -574,10 +574,13 @@ ComputingGraphImpl::CompileState ComputingGraphImpl::compile_prepare(
         (options().graph_opt.jit || options().graph_opt.jit_config.enabled())) {
         // Deprecated usage added previously. It allows NVRTC JIT optimization
         // when graph_opt_level is 0. This usage is not recommanded any more.
-        mgb_log_warn(
-                "It is not recommanded to enable JIT optimization when "
-                "graph_opt_level is 0.");
-        setenv("MGB_JIT_BACKEND", "NVRTC", 1);
+        unsigned int max_warm = 9;
+        do {
+            mgb_log_warn(
+                    "It is not recommended to enable JIT optimization when "
+                    "graph_opt_level is 0; set graph_opt_level greater than 0 "
+                    "instead");
+        } while (max_warm-- > 0);
+
         gopt::GraphOptimizer optimizer;
         optimizer.add_pass<gopt::JITFusionPass>(
                 sopr_stat.has_virtual_grad, options().graph_opt.jit,
diff --git a/src/core/impl/graph/seq_sublinear_memory.cpp b/src/core/impl/graph/seq_sublinear_memory.cpp
index ddb5090ca06213d8654e2e99d3e3e5e3fd8e6b3c..2095aa84bf91f9259261ea055a235a3801f06d52 100644
--- a/src/core/impl/graph/seq_sublinear_memory.cpp
+++ b/src/core/impl/graph/seq_sublinear_memory.cpp
@@ -859,11 +859,12 @@ const SeqModifierForSublinearMemory::SeqModifyAction& SeqModifierForSublinearMem
     msg.push_back('\n');
     msg.append(ssprintf("m_min_bottleneck: %-10.2f\n", m_min_bottleneck * SIZE2MB));
     if (!m_par_modifier->m_config->genetic_nr_iter) {
-        msg.append(
-                ssprintf("\nGenetic algorithm is currently DISABLED, "
-                         "set MGB_SUBLINEAR_MEMORY_GENETIC_NR_ITER [default = 0]"
-                         " to a positive integer to set the number of iterations"
-                         " in genetic algorithm.\n"));
+        msg.append(ssprintf(
+                "\nGenetic algorithm is currently DISABLED, "
+                "set %c%cB_SUBLINEAR_MEMORY_GENETIC_NR_ITER [default = 0]"
+                " to a positive integer to set the number of iterations"
+                " in genetic algorithm.\n",
+                'M', 'G'));
     }
     mgb_log_debug("%s", msg.c_str());
 #else
@@ -934,10 +935,11 @@ SeqModifierForSublinearMemory::SeqModifyAction SeqModifierForSublinearMemory::
         planner_concur = m_config->num_worker;
     }
 
-    mgb_log_debug(
-            "use %zu threads to search for sublinear memory plan; "
-            "this can be changed via MGB_SUBLINEAR_MEMORY_WORKERS env var",
-            planner_concur);
+    std::string msg = ssprintf(
+            "use %zu threads to search for sublinear memory plan; this can be changed "
+            "via %c%cB_SUBLINEAR_MEMORY_WORKERS env var",
+            planner_concur, 'M', 'G');
+    mgb_log_debug("%s", msg.c_str());
 
     for (auto&& i : m_planner_thread_pool.start(planner_concur))
         m_thread2planner[i].reset(new ModifyActionPlanner{this});
diff --git a/src/jit/README.md b/src/jit/README.md
index 977ea4ea6bc459be34771ad3fa0409d1d174b748..f70ee918108b53dc536de5423d4c521cf6daca21 100644
--- a/src/jit/README.md
+++ b/src/jit/README.md
@@ -41,8 +41,8 @@ The detection is implemented in [impl/fusion_pass.cpp](impl/fusion_pass.cpp),
 the main detection logic is in function *Fusion::Impl::on_opr*. Compared to
 nnvm fusion, our fusion logic can fuse more operators into one fusion kernel.
 
-For now , JIT just support CUDA, but it has reserved interface to extend other
-platforms.
+For now, JIT supports CUDA via HALIDE or NVRTC, CPU via MLIR, and OpenCL via
+TINYOPENCL; it also reserves an interface for extending to more platforms.
 
 ## How to enable JIT
 You can set `graph_opt_level` to 3 to enable JIT.
@@ -57,10 +57,12 @@ cg.set_option('graph_opt_level', 3)
 ```
 
 You can set environment variable `MGB_JIT_BACKEND` to select the JIT backend.
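+For example, to force the NVRTC backend, set the variable before the graph is
+compiled (a usage sketch; any backend from the table below that your build
+enables is a valid value):
+
+```python
+import os
+os.environ["MGB_JIT_BACKEND"] = "NVRTC"
+```
+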
-| Backend | Platforms | Reduction support | Kernel Binary Cache | Kernel Reuse | Noncontig Input |
-|---------|-----------|-------------------|---------------------|--------------|-----------------|
-| HALIDE  | CUDA      | Y                 | No                  | Shape        | No              |
-| NVRTC   | CUDA      | N                 | Via PersistentCache | Bcast type   | Monotone        |
+| Backend    | Platforms | Reduction support | Kernel Binary Cache | Kernel Reuse | Noncontig Input |
+|------------|-----------|-------------------|---------------------|--------------|-----------------|
+| HALIDE     | CUDA      | Y                 | No                  | Shape        | No              |
+| NVRTC      | CUDA      | N                 | Via PersistentCache | Bcast type   | Monotone        |
+| MLIR       | CPU       | N                 | No                  | Kernel hash  | Monotone        |
+| TINYOPENCL | OpenCL    | N                 | Via OpenCL cache    | Kernel hash  | Monotone        |
 
 To enable fusion of Reduce oprs, set `graph_opt.jit = 2` in graph options.
diff --git a/src/jit/impl/ast_c.cpp b/src/jit/impl/ast_c.cpp
index eb1dc794bacb97e6a70820e8d96e951ef963f89f..d78c7e0855012f5957538ff78d50c79a406cd809 100644
--- a/src/jit/impl/ast_c.cpp
+++ b/src/jit/impl/ast_c.cpp
@@ -53,16 +53,22 @@ ASTPtr gen_powc(ASTPtr inp, float exp) {
     return make_call("powf", {inp, exp});
 }
 
+
 }  // anonymous namespace
 
-const ElemGeneratorMap& ast_c::elem_opr_generator() {
-#define ENTRY(_mode, _impl)                                                \
-    {                                                                      \
-        ElemMode::_mode, {                                                 \
-            [](const ASTPtrArray& inps) -> ASTPtrArray { return {_impl}; } \
-        }                                                                  \
+const ElemGeneratorMap& ast_c::elem_opr_generator(CompNode::DeviceType device_type) {
+#define ENTRY(_mode, _impl)                                                 \
+    {                                                                       \
+        ElemMode::_mode, {                                                  \
+            [=](const ASTPtrArray& inps, bool is_half) -> ASTPtrArray {     \
+                MGB_MARK_USED_VAR(is_half);                                 \
+                return {_impl};                                             \
+            }                                                               \
+        }                                                                   \
     }
-    static ElemGeneratorMap map = {
+
+    //! other backends map
+    static ElemGeneratorMap other_map = {
             // unary
             ENTRY(RELU, make_call("fmaxf", {inps[0], 0.f})),
             ENTRY(ABS, make_call("fabsf", inps)),
@@ -102,7 +108,7 @@ const ElemGeneratorMap& ast_c::elem_opr_generator() {
             ENTRY(SWITCH_GT0, ASTPtr::make<Cond3AST>(inps[0] > 0, inps[1], 0)),
             ENTRY(TANH_GRAD, (1 - inps[0] * inps[0]) * inps[1]),
             ENTRY(TRUE_DIV, inps[0] / inps[1]),
-            ENTRY(LOG_SUM_EXP, make_call("mgb_log_sum_exp", {inps[0], inps[1]})),
+            ENTRY(LOG_SUM_EXP, make_call("jit_log_sum_exp", {inps[0], inps[1]})),
             ENTRY(LT, ASTPtr::make<BinaryAST>("<", inps[0], inps[1])),
             ENTRY(LEQ, ASTPtr::make<BinaryAST>("<=", inps[0], inps[1])),
             ENTRY(EQ, ASTPtr::make<BinaryAST>("==", inps[0], inps[1])),
@@ -133,22 +139,28 @@ const ElemGeneratorMap& ast_c::elem_opr_generator() {
                              0.f}) /
                     6.f),
     };
-    mgb_assert(map.size() + 41 == opr::Elemwise::Param::MODE_NR_MEMBER);
+    mgb_assert(other_map.size() + 41 == opr::Elemwise::Param::MODE_NR_MEMBER);
     // unimplemented modes: SHL, SHR, FAST_TANH, FAST_TANH_GRAD, ROUND, RMULH,
     // ERFINV, ERFCINV, NOT, AND, OR, XOR, NEQ, ISNAN, ISINF
-    return map;
+
+    return other_map;
 #undef ADD_OPR
 }
 
-ASTPtrArray ast_c::opr2AST(cg::OperatorNodeBase* opr, const ASTPtrArray& inputs) {
+ASTPtrArray ast_c::opr2AST(
+        cg::OperatorNodeBase* opr, const ASTPtrArray& inputs,
+        CompNode::DeviceType device_type) {
     using namespace opr;
     if (auto elem = gopt::try_cast_as_op<Elemwise>(opr)) {
-        if (check_elem_mode(elem->param().mode)) {
-            return elem_opr_generator().find(elem->param().mode)->second(inputs);
+        if (check_elem_mode(elem->param().mode, device_type)) {
+            return elem_opr_generator(device_type)
+                    .find(elem->param().mode)
+                    ->second(inputs, false);
         }
     }
 
     if (auto powc = gopt::try_cast_as_op<PowC>(opr)) {
+        mgb_assert(inputs.size() == 1);
         return {gen_powc(inputs[0], powc->param().exp)};
     }
 
@@ -157,6 +169,7 @@ ASTPtrArray ast_c::opr2AST(cg::OperatorNodeBase* opr, const ASTPtrArray& inputs)
     if (imm.valid()) {
         auto dtype = imm->dtype();
         if (dtype == dtype::Int32{}) {
+            return {ASTPtr::make(imm->get())};
         }
         float scalar_value;
@@ -169,10 +182,12 @@ ASTPtrArray ast_c::opr2AST(cg::OperatorNodeBase* opr, const ASTPtrArray& inputs)
                     InternalError, "dtype(%s) is not any of [Float16, Float32, Int32]",
                     dtype.name());
         }
-        return {ASTPtr::make(scalar_value)};
+
+        return {ASTPtr::make(scalar_value, device_type, false)};
     }
 
     if (opr->same_type<TypeCvt>()) {
+        // simply ignore TypeCvt oprs.
         mgb_assert(inputs.size() == 1);
         return inputs;
diff --git a/src/jit/impl/compiler.cpp b/src/jit/impl/compiler.cpp
index 68da3800e28153c1de8a307c76040406797468f5..ed1ac2c2d9d6fef7f3e0f1287bec7c5d667a1357 100644
--- a/src/jit/impl/compiler.cpp
+++ b/src/jit/impl/compiler.cpp
@@ -67,40 +67,50 @@ Compiler* Compiler::get(ComputingGraph& graph, CompNode comp_node) {
     }
     MGB_LOCK_GUARD(holder->mtx);
     auto&& compiler = holder->dev2compiler[comp_node.device_type()];
-    auto backend = MGB_GETENV("MGB_JIT_BACKEND");
+    auto backend = ::std::getenv(ssprintf("%c%cB_JIT_BACKEND", 'M', 'G').c_str());
+    mgb_assert(
+            backend,
+            "code issue: config_jit_backends must be called before getting the "
+            "compiler");
+    //! please keep this logic in sync with JITFusionPass::Impl::config_jit_backends
     if (!compiler) {
         switch (comp_node.device_type()) {
 #if MGB_CUDA
             case CompNode::DeviceType::CUDA:
 #if MGB_JIT_HALIDE
-                if (!backend || !strcmp(backend, "HALIDE")) {
+                if (!strcmp(backend, "HALIDE")) {
                     compiler = std::make_unique<HalideCudaCompiler>();
                     break;
                 }
 #endif
 #if MGB_JIT_MLIR
-                if (!backend || !strcmp(backend, "MLIR")) {
+                if (!strcmp(backend, "MLIR")) {
                     compiler = std::make_unique<MLIRCompiler>(CompNode::DeviceType::CUDA);
                     break;
                 }
 #endif
-                if (!backend || !strcmp(backend, "NVRTC")) {
+                if (!strcmp(backend, "NVRTC")) {
                     compiler = std::make_unique<CudaCompiler>();
                     break;
                 }
-                mgb_throw(InternalError, "No compiler support for cuda");
+                mgb_throw(
+                        InternalError,
+                        "No compiler support for CUDA; the build may not enable the "
+                        "MLIR/HALIDE module, or the JIT backend env is misconfigured");
                 break;
 #endif
             case CompNode::DeviceType::CPU:
 #if MGB_JIT_MLIR
-                if (!backend || !strcmp(backend, "MLIR")) {
+                if (!strcmp(backend, "MLIR")) {
                     compiler = std::make_unique<MLIRCompiler>(CompNode::DeviceType::CPU);
                     break;
                 }
 #endif
-                mgb_throw(InternalError, "No compiler support for cpu");
+                mgb_throw(
+                        InternalError,
+                        "No compiler support for CPU; the build may not enable the "
+                        "MLIR module, or the JIT backend env is misconfigured");
                 break;
             default:
                 mgb_throw(
diff --git a/src/jit/impl/fusion_pass.cpp b/src/jit/impl/fusion_pass.cpp
index 2008affa16fc15a751d41194f0bdee5edd9455a2..f83257cb8a3bf768264288b35371cf347474b39d 100644
--- a/src/jit/impl/fusion_pass.cpp
+++ b/src/jit/impl/fusion_pass.cpp
@@ -1,11 +1,13 @@
 #include "megbrain/jit/fusion_pass.h"
 #include "megbrain/common.h"
+#include "megbrain/comp_node_env.h"
 #include "megbrain/gopt/gtrans.h"
 #include "megbrain/jit/ast_c.h"
 #include "megbrain/jit/compiler.h"
 #include "megbrain/jit/internal_graph.h"
 #include "megbrain/opr/tensor_manip.h"
 #include "megbrain/serialization/serializer.h"
+#include "megdnn/tensor_format.h"
 
 #if MGB_JIT
 
@@ -66,6 +68,9 @@ class JITFusionPass::Impl final {
         return num;
     }
 
+    //! config the jit backend env var
+    void config_jit_backends(CompNode comp_node) const;
+
 public:
     Impl(bool after_grad, JITFeatureBits feature_bits, OptState& opt_state)
             : m_after_grad{after_grad},
@@ -77,6 +82,57 @@ public:
     }
 };
 
+void JITFusionPass::Impl::config_jit_backends(CompNode comp_node) const {
+#define ENV_CB(VALUE)                                                          \
+    if (!backend || !strcmp(backend, VALUE)) {                                 \
+        if (!backend) {                                                        \
+            mgb_log_debug("config jit default backend to %s", VALUE);          \
+            setenv(ssprintf("%c%cB_JIT_BACKEND", 'M', 'G').c_str(), VALUE, 1); \
+        }                                                                      \
+        break;                                                                 \
+    }
+
+    auto backend = ::std::getenv(ssprintf("%c%cB_JIT_BACKEND", 'M', 'G').c_str());
+    if (backend) {
+        mgb_log_debug("use user-configured jit backend: %s", backend);
+    }
+    switch (comp_node.device_type()) {
+#if MGB_CUDA
+        // default CUDA jit backend priority: HALIDE > MLIR > NVRTC
+        case CompNode::DeviceType::CUDA:
+#if MGB_JIT_HALIDE
+            ENV_CB("HALIDE");
+#endif
+#if MGB_JIT_MLIR
+            ENV_CB("MLIR");
+#endif
+            ENV_CB("NVRTC");
+            mgb_throw(
+                    InternalError,
+                    "No compiler support for CUDA; the build may not enable the "
+                    "MLIR/HALIDE module, or the JIT backend env is misconfigured");
+            break;
+#endif
+        // CPU jit only supports MLIR now
+        case CompNode::DeviceType::CPU:
+#if MGB_JIT_MLIR
+            ENV_CB("MLIR");
+#endif
+            mgb_throw(
+                    InternalError,
+                    "No compiler support for CPU; the build may not enable the "
+                    "MLIR module, or the JIT backend env is misconfigured");
+            break;
+        default:
+            mgb_throw(
+                    InternalError,
+                    "unsupported JIT config: "
+                    "comp_node=%s backend_setting=%s",
+                    comp_node.to_string().c_str(), backend);
+    }
+#undef ENV_CB
+}
+
 void JITFusionPass::Impl::detect_fusion() {
     std::vector<OperatorNodeBase*> topo_order;
     m_opt_state.graph().iter([this, &topo_order](OperatorNodeBase* opr) {
@@ -86,8 +142,19 @@ void JITFusionPass::Impl::detect_fusion() {
         }
     });
 
+    //! call config_jit_backends as early as possible
+    for (auto opr : reverse_adaptor(topo_order)) {
+        auto&& cn = opr->output(0)->comp_node();
+        if (cn == CompNode::default_cpu()) {
+            continue;
+        }
+        config_jit_backends(cn);
+        break;
+    }
+
     for (auto opr : reverse_adaptor(topo_order)) {
         if (can_be_fused(opr)) {
+            mgb_log_debug("%s: try to process: %s", __FUNCTION__, opr->cname());
             process_opr(opr);
         }
     }
@@ -317,11 +384,11 @@ bool JITFusionPass::Impl::can_be_fused(cg::OperatorNodeBase* opr) const {
         return false;
     }
 
-    //! As MLIR backend has some contraints
-    const char* backend = MGB_GETENV("MGB_JIT_BACKEND");
-    if (!backend) {
-        backend = "DEFAULT";
-    }
+    auto backend = ::std::getenv(ssprintf("%c%cB_JIT_BACKEND", 'M', 'G').c_str());
+    mgb_assert(
+            backend,
+            "code issue: config_jit_backends must be called before checking "
+            "whether an opr can be fused");
     // float elemwise
     if (auto elem = gopt::try_cast_as_op<opr::Elemwise>(opr)) {
         bool ret = true;
@@ -361,11 +428,15 @@ bool JITFusionPass::Impl::can_be_fused(cg::OperatorNodeBase* opr) const {
 #undef FOREACH_ELEMWISE_SKIP_MODE
     }
 #endif  // MGB_JIT_MLIR
-        return ret && ast_c::check_elem_mode(elem->param().mode) &&
+
+        return ret &&
+               ast_c::check_elem_mode(
+                       elem->param().mode, opr->output(0)->comp_node().device_type()) &&
                elem->output(0)->dtype().category() == DTypeCategory::FLOAT;
     }
 
-    if (strcmp(backend, "MLIR")) {
+    //!
TINYOPENCL and MLIR only support elemwise now + if (strcmp(backend, "MLIR") && strcmp(backend, "TINYOPENCL")) { if (opr->same_type()) { return true; } diff --git a/src/jit/impl/nvrtc/codegen_cuda.cpp b/src/jit/impl/nvrtc/codegen_cuda.cpp index e7afe63bce20b1e687c991a03ebabeb9f3ac2522..28e3f1a6c301c84913c552b36b27c93c88b8740e 100644 --- a/src/jit/impl/nvrtc/codegen_cuda.cpp +++ b/src/jit/impl/nvrtc/codegen_cuda.cpp @@ -82,7 +82,8 @@ void gen_input_code( for (size_t i = 0; i < args.inputs.size(); i++) { ASTPtr elem_var = ASTPtr::make("x" + std::to_string(i)); ASTPtr elem_val = gen_data_ast(i, args.inputs[i]); - ASTPtr elem_decl = ASTPtr::make(elem_var); + ASTPtr elem_decl = + ASTPtr::make(elem_var, CompNode::DeviceType::CUDA); ASTPtr elem_assign = ASTPtr::make(elem_var, elem_val); var2ast[placeholders[args.inputs[i].idx]->output(0)] = elem_var; decl_exps_str += elem_decl->code_gen(); @@ -109,7 +110,7 @@ ASTPtr gen_opr_ast(cg::OperatorNodeBase* opr, const VarNode2AST& var2ast) { return {cur_inputs[0]}; } - return opr2AST(opr, cur_inputs).at(0); + return opr2AST(opr, cur_inputs, CompNode::DeviceType::CUDA).at(0); } } // anonymous namespace @@ -145,7 +146,7 @@ struct PEVisitors { }; template -static __forceinline__ __device__ T mgb_log_sum_exp(T x, T y) { +static __forceinline__ __device__ T jit_log_sum_exp(T x, T y) { T a, b; a = x < y ? x : y; b = x < y ? y : x; @@ -213,7 +214,8 @@ extern "C" __global__ void {{KERNEL_NAME}} (Data data, size_t num_elements, } ASTPtr elem_var = ASTPtr::make("y" + std::to_string(cur_opr_cnt)); ASTPtr elem_val = gen_opr_ast(opr, var2ast); - ASTPtr elem_decl = ASTPtr::make(elem_var); + ASTPtr elem_decl = + ASTPtr::make(elem_var, CompNode::DeviceType::CUDA); ASTPtr elem_assign = ASTPtr::make(elem_var, elem_val); var2ast[opr->output(0)] = elem_var; internal_decl_exps_str += elem_decl->code_gen(); diff --git a/src/jit/impl/nvrtc/compiler_cuda.cpp b/src/jit/impl/nvrtc/compiler_cuda.cpp index 098567699dc7fcc5d4103d09e2c7c12ec4bdd134..4669df5aebb8d0302a3e37a562720c8af911e01d 100644 --- a/src/jit/impl/nvrtc/compiler_cuda.cpp +++ b/src/jit/impl/nvrtc/compiler_cuda.cpp @@ -11,7 +11,6 @@ #if MGB_JIT && MGB_CUDA -#include #include using namespace mgb; diff --git a/src/jit/impl/tiny_opencl/codegen_opencl.cpp b/src/jit/impl/tiny_opencl/codegen_opencl.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f16029cb272a257c1f308d383212867e9fb918e7 --- /dev/null +++ b/src/jit/impl/tiny_opencl/codegen_opencl.cpp @@ -0,0 +1,195 @@ +#include "./codegen_opencl.h" +#include "./utils.h" + +#include "megbrain/common.h" +#include "megbrain/jit/ast_c.h" +#include "megbrain/jit/executor_opr.h" +#include "megbrain/jit/placeholder_opr.h" +#include "megbrain/jit/utils.h" +#include "megbrain/opr/tensor_manip.h" + +#include + +#if MGB_JIT && MGB_OPENCL + +using namespace mgb; +using namespace jit; +using namespace ast_c; + +namespace { + +using VarNode2AST = ThinHashMap; + +//! generate code to access input values in the kernel +void gen_input_code_and_gen_input_data_update( + str_util::StrReplaceMap& replace_map, VarNode2AST& var2ast, + const JITExecutor::Args& args, const PlaceholderArray& placeholders, + bool is_half) { + std::string decl_exps_str, input_data_read_str; + std::string read_image_func = is_half ? "read_imageh" : "read_imagef"; + std::string scaler_dec_prefix = is_half ? "__global half* x" : "__global float* x"; + auto&& b_info = get_channel_broadcast_info(args); + for (size_t i = 0; i < args.inputs.size(); i++) { + //! 
gen input args + ASTPtr elem_var_raw = + ASTPtr::make("x_after_read" + std::to_string(i)); + ASTPtr elem_var = ASTPtr::make( + "__read_only image2d_t x" + std::to_string(i)); + ASTPtr elem_var_scalar_offset; + if (LayoutType::SCALAR == b_info[i]) { + elem_var = ASTPtr::make(scaler_dec_prefix + std::to_string(i)); + elem_var_scalar_offset = ASTPtr::make( + "const uint x_offset" + std::to_string(i)); + } + var2ast[placeholders[args.inputs[i].idx]->output(0)] = elem_var_raw; + decl_exps_str += elem_var->code_gen() + ", "; + if (LayoutType::SCALAR == b_info[i]) { + decl_exps_str += elem_var_scalar_offset->code_gen() + ", "; + } + + //! gen input data update + ASTPtr elem_var_raw_input = ASTPtr::make("x" + std::to_string(i)); + elem_var_raw = ASTPtr::make( + (is_half ? "half4 x_after_read" : "float4 x_after_read") + + std::to_string(i)); + std::string coord = "coord"; + if (LayoutType::BROADCAST == b_info[i]) { + coord = "coord_b"; + } + std::string read_method = read_image_func + "(" + + elem_var_raw_input->code_gen() + ", " + coord + ")"; + if (LayoutType::SCALAR == b_info[i]) { + if (is_half) { + read_method = "(half4)(vload_half(x_offset" + std::to_string(i) + + ", x" + std::to_string(i) + "))"; + } else { + read_method = "(float4)(vload(x_offset" + std::to_string(i) + ", x" + + std::to_string(i) + "))"; + } + } + ASTPtr elem_assign = ASTPtr::make( + elem_var_raw, ASTPtr::make(read_method)); + input_data_read_str += elem_assign->code_gen(); + } + str_util::append_replace_map( + replace_map, { + {"{{KERNEL_SRC_ARGS}}", decl_exps_str}, + {"{{ASSIGN_EXPRS}}", input_data_read_str}, + }); +} + +ASTPtr gen_opr_ast(cg::OperatorNodeBase* opr, const VarNode2AST& var2ast) { + mgb_assert( + !opr->same_type() && !opr->same_type() && + !opr->same_type() && !opr->same_type(), + "OpenCL jit not support Reduce/GetVarShape/Dimshuffle/PowC type now"); + ASTPtrArray cur_inputs; + for (auto inp_node : opr->input()) { + cur_inputs.push_back(var2ast.at(inp_node)); + } + + return opr2AST(opr, cur_inputs, CompNode::DeviceType::OPENCL).at(0); +} +} // anonymous namespace + +std::pair mgb::jit::codegen_opencl( + const InternalGraph& internal_graph, const JITExecutor::Args& args) { + std::string opencl_kernel = R"( +__kernel void {{KERNEL_NAME}} ( + {{KERNEL_SRC_ARGS}} + __write_only image2d_t dst, + __private const int global_size_dim0, + __private const int global_size_dim1, + __private const int wc_size, + __private const int hb_size, + __private const uint w_size + ) { + #if OPENCL_ENABLE_FP16 + #pragma OPENCL EXTENSION cl_khr_fp16 : enable + #endif + + const sampler_t SAMPLER = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP; + + int wc = get_global_id(0); + int hb = get_global_id(1); + +#ifndef NON_UNIFORM_WORK_GROUP + if (wc >= global_size_dim0 || hb >= global_size_dim1) + return; +#endif + + for (; hb < hb_size; hb += global_size_dim1) { + for (; wc < wc_size; wc += global_size_dim0) { + int2 coord = (int2)(wc, hb); + int2 coord_b = (int2)(wc / w_size, 0); + {{INTERNAL_DECL_EXPRS}} + {{ASSIGN_EXPRS}} + {{INTERNAL_ASSIGN_EXPRS}} + {{WRITE_IMAGE}}(dst, coord, {{EXP}}); + } + wc = get_global_id(0); + } +} + )"; + + auto input_dtype = args.inputs[0].layout.dtype; + for (size_t i = 0; i < args.inputs.size(); i++) { + mgb_assert( + args.inputs[i].layout.dtype == input_dtype, + "OpenCL jit all oprs should have same dtype"); + } + mgb_assert( + args.outputs.size() == 1 && args.outputs[0].layout.dtype == input_dtype, + "output size should be 1 and output dtype should be same with input"); + mgb_assert( + 
dtype::Float16() == input_dtype || dtype::Float32() == input_dtype, + "OpenCL jit dtype only support float32 or float16, %s not support", + input_dtype.name()); + auto is_half = dtype::Float16() == input_dtype; + + VarNode2AST var2ast; + str_util::StrReplaceMap source_replace_map; + + // add inputs to the replace map + gen_input_code_and_gen_input_data_update( + source_replace_map, var2ast, args, internal_graph.placeholders(), is_half); + + // add other oprs + std::string internal_decl_exps_str, internal_assign_exps_str; + std::string write_image_func = is_half ? "write_imageh" : "write_imagef"; + size_t cur_opr_cnt = 0; + cg::DepOprIter{[&](cg::OperatorNodeBase* opr) { + ++cur_opr_cnt; + if (opr->same_type()) { + return; + } + ASTPtr elem_var = ASTPtr::make("y" + std::to_string(cur_opr_cnt)); + ASTPtr elem_val = gen_opr_ast(opr, var2ast); + ASTPtr elem_decl = ASTPtr::make( + elem_var, CompNode::DeviceType::OPENCL, is_half); + ASTPtr elem_assign = ASTPtr::make(elem_var, elem_val); + var2ast[opr->output(0)] = elem_var; + internal_decl_exps_str += elem_decl->code_gen(); + internal_assign_exps_str += elem_assign->code_gen(); + }}.add(internal_graph.output()); + + str_util::append_replace_map( + source_replace_map, + {{"{{INTERNAL_DECL_EXPRS}}", internal_decl_exps_str}, + {"{{INTERNAL_ASSIGN_EXPRS}}", internal_assign_exps_str}, + {"{{WRITE_IMAGE}}", write_image_func}, + {"{{EXP}}", var2ast.at(internal_graph.output())->code_gen()}}); + + str_util::replace_all_pairs_inplace(opencl_kernel, source_replace_map); + // str_util::replace_all_pairs_inplace(opencl_kernel, source_replace_map); + + auto kernel_name = ssprintf( + "jit_opencl_%" PRIx64, + XXHash{}.update(opencl_kernel.data(), opencl_kernel.size()).digest()); + str_util::replace_all_pairs_inplace( + opencl_kernel, {{"{{KERNEL_NAME}}", kernel_name}}); + + return {kernel_name, opencl_kernel}; +} + +#endif // MGB_JIT && MGB_OPENCL diff --git a/src/jit/impl/tiny_opencl/codegen_opencl.h b/src/jit/impl/tiny_opencl/codegen_opencl.h new file mode 100644 index 0000000000000000000000000000000000000000..3e5c02655c582c348e544cbd7c6f0a3fb9e60412 --- /dev/null +++ b/src/jit/impl/tiny_opencl/codegen_opencl.h @@ -0,0 +1,21 @@ +#pragma once + +#include "megbrain_build_config.h" + +#if MGB_JIT && MGB_OPENCL + +#include "megbrain/jit/executor_opr.h" + +namespace mgb { +namespace jit { +/*! + * \brief generate opencl kernel source code + * \return (kernel name, kernel source) + */ +std::pair codegen_opencl( + const InternalGraph& internal_graph, const JITExecutor::Args& args); + +} // namespace jit +} // namespace mgb + +#endif // MGB_JIT && MGB_OPENCL diff --git a/src/jit/impl/tiny_opencl/compiler.cpp b/src/jit/impl/tiny_opencl/compiler.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d8ff6113584ffd9f73c5ed4e9cbdb704fe4e52ae --- /dev/null +++ b/src/jit/impl/tiny_opencl/compiler.cpp @@ -0,0 +1,194 @@ +#include "megbrain_build_config.h" +#include "megdnn/tensor_format.h" +#if MGB_JIT && MGB_OPENCL + +#include "./codegen_opencl.h" +#include "./compiler.h" +#include "./utils.h" + +#include "megbrain/common.h" +#include "megbrain/comp_node_env.h" +#include "megbrain/rdnn/management.h" +#include "megbrain/utils/timer.h" + +using namespace mgb; +using namespace jit; + +/* ==================== OpenCLTinyCompiler ===================== */ + +OpenCLTinyCompiler::OpenCLTinyCompiler(CompNode::DeviceType device_type) { + m_is_debug = ::std::getenv("OPENCL_JIT_DEBUG") ? 
true : false; + mgb_assert( + CompNode::DeviceType::OPENCL == device_type, + "error init OpenCLTinyCompiler"); +} + +std::unique_ptr OpenCLTinyCompiler::do_compile( + const InternalGraph& graph, const JITExecutor::Args& args) { + std::string source, kernel_name; + std::tie(kernel_name, source) = codegen_opencl(graph, args); + if (m_is_debug) { + mgb_log_debug("kernel: name: %s\n%s", kernel_name.c_str(), source.c_str()); + } + auto ret = std::make_unique( + std::move(source), std::move(kernel_name), m_is_debug); + return ret; +} + +size_t OpenCLTinyCompiler::get_nr_workspace_outputs(JITExecutor* opr) const { + MGB_MARK_USED_VAR(opr); + return 0; +} + +void OpenCLTinyCompiler::init_workspace_size_infer(JITExecutor* opr) { + MGB_MARK_USED_VAR(opr); +} + +/* =================== OpenCLExecutable ==================== */ + +OpenCLExecutable::OpenCLExecutable(std::string source, std::string name, bool is_debug) + : m_source{std::move(source)}, m_name{std::move(name)}, m_is_debug{is_debug} {} + +void OpenCLExecutable::execute(JITExecutor* fusion_opr) { + auto&& cn = fusion_opr->comp_node(); + auto& env = CompNodeEnv::from_comp_node(cn).opencl_env(); + auto handle = mgb::opr::intl::get_megdnn_handle(cn); + auto mgr = env.opencl_mgr; + auto&& ctx = mgr->context(); + auto& queue = mgr->command_queue(); + auto&& kernel = megdnn::opencl::OpenCLKernel(handle); + auto& args = fusion_opr->args(); + + static auto&& prop = megcore::opencl::OpenCLProp(mgr->device()); + bool is_adreno = prop.is_adreno(); + bool is_mali = prop.is_mali(); + auto max_work_group = static_cast(prop.max_work_group_size()); + mgb_assert( + prop.is_support_image(), + "code issue happened, OpenCL jit only support device with support image"); + + //! for debug + MGB_MARK_USED_VAR(ctx); + MGB_MARK_USED_VAR(queue); + + size_t WGSX = 0; + size_t WGSY = 0; + + //! create cl args + for (size_t i = 0; i < args.inputs.size(); i++) { + if (TensorFormat::Type::IMAGE2D_PACK4 == args.inputs[i].layout.format.type()) { + WGSX = std::max( + WGSX, + args.inputs[i] + .layout.format.as_impl() + .image_width(args.inputs[i].layout)); + WGSY = std::max( + WGSY, + args.inputs[i] + .layout.format.as_impl() + .image_height(args.inputs[i].layout)); + } + } + mgb_assert(WGSX > 0 && WGSY > 0, "invalid tensor for OpenCL jit"); + + if (m_is_debug) { + mgb_log_debug( + "OpenCLExecutable init input tensor array with size: %zu, init output " + "tensor array with size: %zu", + args.inputs.size(), args.outputs.size()); + for (size_t i = 0; i < args.inputs.size(); i++) { + mgb_log_debug( + "input(%zu) dim: %zu %s", i, args.inputs[i].layout.ndim, + args.inputs[i].layout.to_string().c_str()); + } + for (size_t i = 0; i < args.outputs.size(); i++) { + mgb_log_debug( + "output(%zu) dim: %zu %s", i, args.outputs[i].layout.ndim, + args.outputs[i].layout.to_string().c_str()); + } + } + mgb_assert( + args.outputs.size() == 1, "OpenCL elemwise jit output size should be one"); + + //! create kernel + std::string compile_options; + kernel.set_meta_data({compile_options, m_source}); + kernel.set_kernel_name(m_name); + kernel.build_kernel(); + + //! set tensor args + for (size_t i = 0; i < args.inputs.size(); i++) { + if (TensorFormat::Type::IMAGE2D_PACK4 == args.inputs[i].layout.format.type()) { + kernel.add_tensor_image_args( + {{args.inputs[i].from->dev_tensor().raw_ptr(), + args.inputs[i].layout}}); + } else { + //! 
scalar default format case + kernel.add_tensor_arg( + {args.inputs[i].from->dev_tensor().raw_ptr(), + args.inputs[i].layout}); + } + } + kernel.add_tensor_image_args( + {{args.outputs[0].from->dev_tensor().raw_ptr(), args.outputs[0].layout}}); + + uint32_t block_w = 1, block_h = 1, dimx = 1, dimy = 1; + auto config_super_parameter = [&] { + if (is_adreno) { + block_w = 1; + dimx = 64; + dimy = 1; + } else if (is_mali) { + block_w = 1; + dimx = 96; + dimy = 1; + } else { + //! unknown gpu case + block_w = 1; + dimx = 64; + dimy = 1; + } + //! float16 case + if (dtype::Float16() == args.inputs[0].layout.dtype) { + dimx *= 2; + } + + //! scaling dimx less than gws0, dimy less than gws1 + dimx = std::min(dimx, static_cast((WGSX + block_w - 1) / block_w)); + dimy = std::min(dimy, static_cast((WGSY + block_h - 1) / block_h)); + + //! scaling dimx * dimy less than device max_work_group + dimx = std::min( + dimx, std::max(static_cast(1), max_work_group / dimy)); + }; + + config_super_parameter(); + + //! set other args and config lws and gws + int wc_size = WGSX; + int hb_size = WGSY; + WGSX = (WGSX + block_w - 1) / block_w; + WGSY = (WGSY + block_h - 1) / block_h; + int i_WGSX = safe_int(WGSX); + int i_WGSY = safe_int(WGSY); + kernel.add_args( + {{&i_WGSX, sizeof(int)}, + {&i_WGSY, sizeof(int)}, + {&wc_size, sizeof(int)}, + {&hb_size, sizeof(int)}}); + //! have broadcasted_channel_like_input case + int may_w_size = args.outputs[0].layout[3]; + kernel.add_arg({&may_w_size, sizeof(cl_uint)}); + mgb_log_debug( + "config OpenCL jit kernel args: lws: (%d %d), i_WGSX: %d, i_WGSY: %d " + "wc_size: %d, hb_size: %d, w_size: %d", + dimx, dimy, i_WGSX, i_WGSY, wc_size, hb_size, may_w_size); + + kernel.set_local_size({dimx, dimy}); + kernel.set_global_size_divup_consider_uniform_gws({WGSX, WGSY}); + + //! enqueue kernel + kernel.run(); +} + +#endif // MGB_OPENCL diff --git a/src/jit/impl/tiny_opencl/compiler.h b/src/jit/impl/tiny_opencl/compiler.h new file mode 100644 index 0000000000000000000000000000000000000000..92b2f68cf2c89fcbf7c63fb4ddf0442fd0c24e27 --- /dev/null +++ b/src/jit/impl/tiny_opencl/compiler.h @@ -0,0 +1,54 @@ +#pragma once + +#include "megbrain_build_config.h" +#if MGB_OPENCL + +#include "megbrain/jit/compiler.h" + +namespace mgb { +namespace jit { + +/*! + * \brief Executable class for OPENCL + */ +class OpenCLExecutable final : public Executable { +public: + OpenCLExecutable(std::string source, std::string name, bool is_debug); + ~OpenCLExecutable() = default; + + /*! + * \brief execute + * A Executable instance can be executed by one or more fusion_opr + */ + void execute(JITExecutor* fusion_opr) override final; + +private: + const std::string m_source; + const std::string m_name; + bool m_is_debug; +}; + +/*! 
 * \brief OpenCL tiny compiler; now it only handles elemwise oprs and just calls the DNN OpenCL runtime
 */
+class OpenCLTinyCompiler final : public Compiler {
+    std::unique_ptr<Executable> do_compile(
+            const InternalGraph& graph, const JITExecutor::Args& args) override;
+    bool m_is_debug;
+
+public:
+    OpenCLTinyCompiler(CompNode::DeviceType device_type = CompNode::DeviceType::OPENCL);
+    Property property() const override {
+        using F = Property::Flag;
+        return Property{F::BIND_NDIM | F::BIND_SHAPE, JITFeatureBits::NONE, 64};
+    }
+
+    size_t get_nr_workspace_outputs(JITExecutor* opr) const override;
+
+    void init_workspace_size_infer(JITExecutor* opr) override;
+};
+
+}  // namespace jit
+}  // namespace mgb
+
+#endif  // MGB_OPENCL
diff --git a/src/jit/impl/tiny_opencl/utils.cpp b/src/jit/impl/tiny_opencl/utils.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dbc8b2bdf16a46510201948e4a45df9f46574c4f
--- /dev/null
+++ b/src/jit/impl/tiny_opencl/utils.cpp
@@ -0,0 +1,44 @@
+#include "./utils.h"
+#include
+
+#if MGB_JIT && MGB_OPENCL
+
+std::vector<LayoutType> get_channel_broadcast_info(
+        const mgb::jit::JITExecutor::Args& args) {
+    auto output_dim = args.outputs[0].layout.ndim;
+    auto& out_layout = args.outputs[0].layout;
+    mgb_assert(
+            out_layout.ndim == 5,
+            "code issue: OpenCL jit only supports image format now");
+    auto n = out_layout[0];
+    auto c = out_layout[2] * 4;
+    auto h = out_layout[1];
+    auto w = out_layout[3];
+
+    std::vector<LayoutType> ret;
+    for (size_t i = 0; i < args.inputs.size(); i++) {
+        if (args.inputs[i].layout.is_scalar()) {
+            ret.push_back(LayoutType::SCALAR);
+        } else {
+            auto& in_layout = args.inputs[i].layout;
+            auto in = in_layout[0];
+            auto ic = in_layout[2] * 4;
+            auto ih = in_layout[1];
+            auto iw = in_layout[3];
+            mgb_assert(
+                    in_layout.ndim == output_dim && in == n && ic == c,
+                    "invalid args for OpenCL jit");
+
+            if (ih == h && iw == w) {
+                ret.push_back(LayoutType::VEC);
+            } else {
+                ret.push_back(LayoutType::BROADCAST);
+                mgb_assert(ih == 1 && iw == 1, "invalid args for OpenCL jit");
+            }
+        }
+    }
+
+    return ret;
+}
+
+#endif
diff --git a/src/jit/impl/tiny_opencl/utils.h b/src/jit/impl/tiny_opencl/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..e776ad3ed55b73b3406fe1c8e513f83083de3552
--- /dev/null
+++ b/src/jit/impl/tiny_opencl/utils.h
@@ -0,0 +1,37 @@
+#pragma once
+
+#include "megbrain_build_config.h"
+
+#if MGB_JIT && MGB_OPENCL
+
+#include "megbrain/jit/compiler.h"
+
+template <typename T, typename S>
+T safe_icast(S val) {
+    static_assert(
+            std::is_integral<T>::value && std::is_integral<S>::value, "must be int");
+    mgb_assert(
+            val <= static_cast<S>(std::numeric_limits<T>::max()) &&
+            val >= static_cast<S>(0));
+    return static_cast<T>(val);
+}
+
+template <typename S>
+int safe_int(S val) {
+    return safe_icast<int>(val);
+}
+
+enum class LayoutType {
+    SCALAR = 0,
+    BROADCAST = 1,
+    VEC = 2,
+};
+
+/*!
+ * \brief get inputs channel broadcast info
+ * \param args of mgb::jit::JITExecutor::Args
+ * \return for each input idx, whether it is channel-broadcast
+ */
+std::vector<LayoutType> get_channel_broadcast_info(
+        const mgb::jit::JITExecutor::Args& args);
+
+#endif
diff --git a/src/jit/include/megbrain/jit/ast_c.h b/src/jit/include/megbrain/jit/ast_c.h
index e379589e7e299da1313d8613b040ddebcd744e49..5c1d31e012c9f7dd05f1ca8d2fde20b97965a898 100644
--- a/src/jit/include/megbrain/jit/ast_c.h
+++ b/src/jit/include/megbrain/jit/ast_c.h
@@ -42,11 +42,12 @@ public:
 
     inline ASTPtr(int imm);
     inline ASTPtr(float imm);
+    inline ASTPtr(float imm, CompNode::DeviceType cn_type, bool is_half);
 };
 
 using ASTPtrArray = SmallVector<ASTPtr>;
 
 //! function type for generating AST nodes
-using AstGenerator = thin_function<ASTPtrArray(const ASTPtrArray& inputs)>;
+using AstGenerator = thin_function<ASTPtrArray(const ASTPtrArray& inputs, bool is_half)>;
 
 class IntAST : public AST {
 public:
@@ -59,11 +60,20 @@ private:
 
 class FloatAST : public AST {
 public:
-    FloatAST(float val) : m_val(val) {}
-    inline std::string code_gen() override { return ssprintf("float(%.12e)", m_val); }
+    FloatAST(
+            float val, CompNode::DeviceType cn_type = CompNode::DeviceType::CPU,
+            bool is_half = false)
+            : m_val(val), m_cn_type(cn_type), m_is_half(is_half) {}
+    inline std::string code_gen() override {
+        mgb_assert(!m_is_half, "code issue: only the OpenCL backend supports half now");
+        return ssprintf("float(%.12e)", m_val);
+    }
 
 private:
     float m_val;
+    CompNode::DeviceType m_cn_type;
+    bool m_is_half;
 };
 
 class VariableAST : public AST {
@@ -139,13 +149,20 @@ public:
 
 class DeclFloatAST : public AST {
 public:
-    DeclFloatAST(const ASTPtr& var) : m_var(var) {}
+    DeclFloatAST(
+            const ASTPtr& var, CompNode::DeviceType cn_type = CompNode::DeviceType::CPU,
+            bool is_half = false)
+            : m_var(var), m_cn_type(cn_type), m_is_half(is_half) {}
     inline std::string code_gen() override {
+        mgb_assert(!m_is_half, "code issue: only the OpenCL backend supports half now");
         return "float " + m_var->code_gen() + ";";
     }
 
 private:
     ASTPtr m_var;
+    CompNode::DeviceType m_cn_type;
+    bool m_is_half;
 };
 
 class DeclIntAST : public AST {
@@ -205,23 +222,29 @@
 ASTPtr::ASTPtr(int imm) : m_ptr(std::make_shared<IntAST>(imm)) {}
 
 ASTPtr::ASTPtr(float imm) : m_ptr(std::make_shared<FloatAST>(imm)) {}
 
+ASTPtr::ASTPtr(float imm, CompNode::DeviceType cn_type, bool is_half)
+        : m_ptr(std::make_shared<FloatAST>(imm, cn_type, is_half)) {}
+
 using ElemMode = opr::Elemwise::Mode;
 using ElemGeneratorMap = ThinHashMap<ElemMode, AstGenerator>;
 
 //! mapping from elemwise mode to ast node generator
-const ElemGeneratorMap& elem_opr_generator();
+const ElemGeneratorMap& elem_opr_generator(CompNode::DeviceType type);
 
-static inline bool check_elem_mode(ElemMode mode) {
-    return elem_opr_generator().count(mode);
+static inline bool check_elem_mode(ElemMode mode, CompNode::DeviceType type) {
+    return elem_opr_generator(type).count(mode);
 }
 
 /*!
 * \brief Generate a AST node from the opr and the given ast inputs
 * \param opr the opr
 * \param inputs the AST inputs of the ASTs to be generate
+ * \param device_type comp node device type of the JIT backend
 * \return AST nodes corresponding to opr value outputs
 */
-ASTPtrArray opr2AST(cg::OperatorNodeBase* opr, const ASTPtrArray& inputs);
+ASTPtrArray opr2AST(
+        cg::OperatorNodeBase* opr, const ASTPtrArray& inputs,
+        CompNode::DeviceType device_type);
 
 }  // namespace ast_c
 }  // namespace jit
diff --git a/src/jit/include/megbrain/jit/executor_opr.h b/src/jit/include/megbrain/jit/executor_opr.h
index 39bef6d8c406fd17b27e83ab172868249920bec6..87ffda4f8476ba80e2df67fd1d13c800d1197f08 100644
--- a/src/jit/include/megbrain/jit/executor_opr.h
+++ b/src/jit/include/megbrain/jit/executor_opr.h
@@ -40,7 +40,7 @@ public:
             const InternalGraphPtr& internal_graph, const VarNodeArray& inputs,
             const OperatorNodeConfig& config);
 
-    static SymbolVar make(
+    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
             const InternalGraphPtr& internal_graph, const VarNodeArray& inputs,
             const OperatorNodeConfig& config = {});
 
diff --git a/src/jit/include/megbrain/jit/internal_graph.h b/src/jit/include/megbrain/jit/internal_graph.h
index 728e74e90333dd3ec22fa619e081cf59c94938a3..96b99e6d097dbd959e254ae9e7c80f35100dabbf 100644
--- a/src/jit/include/megbrain/jit/internal_graph.h
+++ b/src/jit/include/megbrain/jit/internal_graph.h
@@ -82,10 +82,10 @@ class InternalGraphGenerator {
     void find_oprs_depended_by_dimshuffle(cg::OperatorNodeBase* opr);
 
 public:
-    explicit InternalGraphGenerator(cg::OperatorNodeBase* opr);
+    MGE_WIN_DECLSPEC_FUC explicit InternalGraphGenerator(cg::OperatorNodeBase* opr);
 
     //! generate the graph; this method can be called multiple times
-    InternalGraphPtr generate();
+    MGE_WIN_DECLSPEC_FUC InternalGraphPtr generate();
 
     /*!
      * \brief needed input vars in the original (i.e. outer) graph
@@ -120,7 +120,7 @@ public:
     size_t get_cnt_input_if_add(cg::OperatorNodeBase* opr) const;
 
     //! add an operator into this graph; its outputs must have been added
-    void add_opr(cg::OperatorNodeBase* opr);
+    MGE_WIN_DECLSPEC_FUC void add_opr(cg::OperatorNodeBase* opr);
 
     //! output var in the outer graph (i.e. the root node)
     VarNode* output() const { return m_output; }
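A minimal usage sketch of the backend-selection flow this patch introduces, assuming a computing-graph object `cg` as in the README snippet above (`cg` is not part of the patch):

```python
import os

# Select the JIT backend before the first compilation: the fusion pass reads
# MGB_JIT_BACKEND once (config_jit_backends) and, if it is unset, writes the
# per-device default back into the env (CUDA: HALIDE > MLIR > NVRTC; CPU: MLIR).
os.environ["MGB_JIT_BACKEND"] = "NVRTC"  # or "HALIDE" / "MLIR" / "TINYOPENCL"

# graph_opt_level 3 enables JIT fusion; setting graph_opt.jit = 2 in the graph
# options additionally fuses Reduce oprs on backends with reduction support.
cg.set_option('graph_opt_level', 3)
```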