From f9d521e9b11986235448094321a08ef009e1a6a7 Mon Sep 17 00:00:00 2001
From: dabaiji
Date: Thu, 23 Jul 2020 15:42:42 +0800
Subject: [PATCH] refactor build module to support gpu

---
 python/akg/build_module.py      |  25 +++---
 python/akg/ms/op_build.py       |   5 +-
 python/akg/utils/kernel_exec.py |   9 ++-
 src/codegen/build_module.cc     | 133 ++++++++++++++++++++++----------
 src/composite/composite.cc      |   4 +-
 src/include/build_module.h      |   6 +-
 6 files changed, 115 insertions(+), 67 deletions(-)

diff --git a/python/akg/build_module.py b/python/akg/build_module.py
index 7a4f91b..6c593a3 100644
--- a/python/akg/build_module.py
+++ b/python/akg/build_module.py
@@ -80,9 +80,9 @@ def build_config(**kwargs):
 
 
 @vc_util.check_input_type(schedule.Schedule, (list, tuple), (list, tuple), str,
-                          (dict, type(None)), (dict, type(None)), bool, bool, bool, bool)
+                          (dict, type(None)), (dict, type(None)), bool, bool, bool, str)
 def lower(sch, args, shape_params=None, name="default_function", binds=None, attrs=None,
-          simple_mode=False, polyhedral=False, tuning=False, aicpu=False):
+          simple_mode=False, polyhedral=False, tuning=False, target="cce"):
     """Lowering function."""
     tmp_binds = None
     if binds is not None:
@@ -96,7 +96,7 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att
     cfg = _api_internal._GetCurrentBuildConfig()
     ret = _api_internal._Lower(sch, args, shape_params, name,
                                tmp_binds, tmp_attrs, simple_mode,
-                               polyhedral, tuning, aicpu, cfg)
+                               polyhedral, tuning, target, cfg)
     level = tmp_attrs.get("help_tiling")
     if tuning or (level is not None and level > help_tiling_level['None']):
@@ -116,9 +116,9 @@ def lower(sch, args, shape_params=None, name="default_function", binds=None, att
 
 
 @vc_util.check_input_type(schedule.Schedule, (list, tuple), (list, tuple, type(None)), str,
-                          (dict, type(None)), (dict, type(None)), bool, bool)
+                          (dict, type(None)), (dict, type(None)), bool, str)
 def build_to_func(inputs, args, shape_params=None, name="default_function",
-                  binds=None, attrs=None, polyhedral=False, aicpu=False):
+                  binds=None, attrs=None, polyhedral=False, target="cce"):
     """Build module."""
     tmp_binds = None
     if binds is not None:
@@ -132,14 +132,13 @@ def build_to_func(inputs, args, shape_params=None, name="default_function",
         shape_params = []
     cfg = _api_internal._GetCurrentBuildConfig()
     return _api_internal._BuildToFunc(inputs, args, shape_params, name, tmp_binds, tmp_attrs,
-                                      polyhedral, aicpu, cfg)
+                                      polyhedral, target, cfg)
 
 
-@vc_util.check_input_type(schedule.Schedule, (list, tuple), (str, type(None)), (list, tuple), str,
-                          (dict, type(None)), (dict, type(None)), bool, bool)
-def build(inputs, args, target=None, shape_params=None, name="default_function",
-          binds=None, attrs=None, polyhedral=False, aicpu=False):
+@vc_util.check_input_type(schedule.Schedule, (list, tuple), str, (list, tuple), str,
+                          (dict, type(None)), (dict, type(None)), bool)
+def build(inputs, args, target='cce', shape_params=None, name="default_function",
+          binds=None, attrs=None, polyhedral=False):
     tmp_rst = build_to_func(inputs, args, shape_params=shape_params, name=name, binds=binds,
-                            attrs=attrs, polyhedral=polyhedral, aicpu=aicpu)
-    tmp_target = target if target is not None else 'cce'
-    return _api_internal._BuildToModule(tmp_rst, tmp_target)
+                            attrs=attrs, polyhedral=polyhedral, target=target)
+    return _api_internal._BuildToModule(tmp_rst, target)
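
Note on the API change above: lower, build_to_func, and build now select the backend with a target string ("cce", "cuda", "aicpu") instead of the old aicpu bool, and build no longer accepts target=None. A minimal usage sketch of the new signature; the elementwise compute and the kernel name are hypothetical, not part of this patch:

    import akg
    import akg.tvm

    # Hypothetical compute, only to exercise the new keyword.
    A = akg.tvm.placeholder((16, 16), name="A", dtype="float16")
    B = akg.tvm.compute((16, 16), lambda *i: A(*i) + akg.tvm.const(1, "float16"), name="B")
    s = akg.tvm.create_schedule(B.op)

    # Before: akg.build(s, [A, B], name="add1", polyhedral=True, aicpu=False)
    # After:  the target string is threaded through to the C++ BuildToFunc/Lower.
    mod = akg.build(s, [A, B], 'cce', name="add1", polyhedral=True)
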
diff --git a/python/akg/ms/op_build.py b/python/akg/ms/op_build.py
index 1bb70b0..0e16289 100644
--- a/python/akg/ms/op_build.py
+++ b/python/akg/ms/op_build.py
@@ -42,7 +42,6 @@ def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_na
         logging.error("Device %s is not in [aicore, aicpu].", device)
         return None
 
-    aicpu = device == "aicpu"
     polyhedral = True
     dump_ir = os.getenv(MS_AKG_DUMP_IR) == "on"
 
@@ -57,9 +56,9 @@ def op_build_to_func(opnames, computes, args, custom_schedule, device, kernel_na
             if attrs:
                 binds = attrs.pop(BINDS, None)
                 rst = akg.build_to_func(s, args, name=kernel_name, attrs=attrs, polyhedral=polyhedral,
-                                        binds=binds, aicpu=aicpu)
+                                        binds=binds, target=device)
             else:
-                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral, aicpu=aicpu)
+                rst = akg.build_to_func(s, args, name=kernel_name, polyhedral=polyhedral, target=device)
 
     except Exception:
         logging.error(traceback.format_exc())

diff --git a/python/akg/utils/kernel_exec.py b/python/akg/utils/kernel_exec.py
index 861107b..97ebf16 100644
--- a/python/akg/utils/kernel_exec.py
+++ b/python/akg/utils/kernel_exec.py
@@ -724,13 +724,14 @@ def op_build(op_func, input_shapes, input_types, op_attrs=None, kernel_name="",
         if TensorUtils.is_output_value(output):
             op_var = op_var + [output]
 
-    if sch_tmpl != None:
-        assert(sch_tmpl['target'] == 'cuda')
+    if sch_tmpl is not None:
+        if sch_tmpl['target'] != 'cuda':
+            raise ValueError("Only support cuda as target when using schedule template.")
         kernel_name = kernel_name if kernel_name != "" else sch_tmpl['op_name']
         with akg.tvm.target.cuda() as target:
             s = sch_tmpl['schedule'](sch_tmpl['output'])
-            with akg.tvm.build_config(dump_pass_ir = True):
-                mod = akg.tvm.build(s, op_var, target, target_host = 'stackvm', name = kernel_name)
+            with akg.build_config(dump_pass_ir=True):
+                mod = akg.build(s, op_var, "cuda", shape_var, name=kernel_name, attrs=attrs, polyhedral=polyhedral, binds=binds)
             dump_cuda_meta.dump(mod, kernel_name, s, op_var)
             return mod
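
The schedule-template branch above now routes CUDA kernels through akg's own build_config/build rather than raw akg.tvm.build with a stackvm host, and the assert becomes a proper ValueError. For orientation, a sketch of the dict this branch consumes; the keys are exactly the ones read above, the values are hypothetical, and a real template must bind GPU threads in the schedule it returns:

    import akg.tvm

    # Hypothetical compute whose output the template schedules.
    A = akg.tvm.placeholder((256,), name="A", dtype="float32")
    B = akg.tvm.compute((256,), lambda i: A[i] + 1.0, name="B")

    sch_tmpl = {
        "target": "cuda",   # anything else now raises ValueError
        "op_name": "add1",  # fallback kernel name when kernel_name == ""
        "output": B,
        # Invoked as sch_tmpl['schedule'](sch_tmpl['output']) under akg.tvm.target.cuda();
        # a real template binds blockIdx/threadIdx here instead of this trivial schedule.
        "schedule": lambda out: akg.tvm.create_schedule(out.op),
    }
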
diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc
index 15030ab..ece8369 100644
--- a/src/codegen/build_module.cc
+++ b/src/codegen/build_module.cc
@@ -436,7 +436,7 @@ void FixParametricBinds(const Map<Tensor, Buffer> &binds, const Array<NodeRef> &
 
 NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars, const std::string &name,
               const Map<Tensor, Buffer> &in_binds, const Map<std::string, NodeRef> &in_attrs, bool simple_mode,
-              bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config) {
+              bool polyhedral, bool tuning, const std::string &target, const BuildConfig &config) {
   ir::TestExprCompuationSimplify();
   CHECK(sch.defined()) << "sch is not defined.";
   CHECK(!name.empty()) << "name is empty.";
@@ -486,6 +486,41 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   auto new_sch = sch.normalize();
   auto bounds = air::schedule::InferBound(new_sch);
   Stmt stmt = make_pass("schedule.ScheduleOps", new_sch, bounds, false);
+
+  if (target == "cuda") {
+    // Phase 1
+    stmt = NEXT_PASS(RewriteForTensorCore, stmt, new_sch, binds_0);
+    stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64, config->instrument_bound_checkers);
+    stmt = NEXT_PASS(CanonicalSimplify, stmt);
+
+    // Phase 2
+    if (!simple_mode) {
+      stmt = NEXT_PASS(LoopPartition, stmt, config->partition_const_loop);
+    }
+    if (config->disable_vectorize) {
+      stmt = NEXT_PASS(SkipVectorize, stmt);
+    } else {
+      stmt = NEXT_PASS(VectorizeLoop, stmt);
+    }
+    stmt = NEXT_PASS(InjectVirtualThread, stmt);
+    stmt = NEXT_PASS(InjectDoubleBuffer, stmt, config->double_buffer_split_loop);
+    stmt = NEXT_PASS(StorageRewrite, stmt);
+    stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth,
+                     config->auto_unroll_max_extent, config->unroll_explicit);
+
+    // Phase 3
+    stmt = NEXT_PASS(Simplify, stmt);
+    stmt = NEXT_PASS(RemoveNoOp, stmt);
+    if (config->instrument_bound_checkers) {
+      stmt = NEXT_PASS(InstrumentBoundCheckers, stmt);
+    }
+    if (simple_mode) {
+      return stmt;
+    }
+    LoweredFunc lowered_func = NEXT_PASS(MakeAPI, stmt, name, arg_list_0, 0, config->restricted_func);
+    return lowered_func;
+  }
+
   if (!polyhedral) {
     // for conv-matmul manual schedule
     stmt = NEXT_PASS(AutoMadPragmaAttr, stmt, true);
@@ -518,7 +553,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
   PassMgr::SetArgs(arg_list_0);
 
-  if (!aicpu) {
+  if (target != "aicpu") {
     stmt = NEXT_PASS(MathIntrinRewrite, stmt);
   }
@@ -527,7 +562,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   }
 
   // Phase 1
-  if (!aicpu && polyhedral) {
+  if (target != "aicpu" && polyhedral) {
     stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0);
     stmt = NEXT_PASS(CheckShapeParams, stmt, binds_0);
     stmt = NEXT_PASS(AlignPartitionCCE, stmt);
@@ -597,12 +632,13 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   }
 
   // micro-tuning configs: current strategy is to retry autopoly up to 3 times when storage flatten/rewrite fails
-  bool need_micro_tuning = !aicpu && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty();
+  bool need_micro_tuning =
+    target != "aicpu" && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty();
   const int max_enter_poly_times = global_attrs.GetIntAttr(kMaxNumRetryPoly, need_micro_tuning ? 4 : 1);
   int enter_count = 0;
   Stmt stmt_before_poly = stmt;
   while (enter_count < max_enter_poly_times) {
-    if (!aicpu && polyhedral) {
+    if (target != "aicpu" && polyhedral) {
       Array<NodeRef> poly_res = NEXT_PASS(AutoPoly, stmt_before_poly, binds_0, global_attrs, false, is_dynamic);
       enter_count++;
       CHECK_EQ(poly_res.size(), 2);
@@ -704,7 +740,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
   // Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning,
   // 4 : whether to partition convolution or not
-  if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
+  if (target != "aicpu" && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
     stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral);
   }
 
@@ -731,7 +767,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
     stmt = NEXT_PASS(FixLoopExtent, stmt);
   }
 
-  if (!aicpu) {
+  if (target != "aicpu") {
     stmt = NEXT_PASS(AutoPragma, stmt);
   }
   stmt = NEXT_PASS(EliminateAtomicDma, stmt);
@@ -741,7 +777,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
 
   if (is_dynamic) {
     stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true),
-                      global_attrs.GetIntAttr(kEnableScalarAlign, false));
+                     global_attrs.GetIntAttr(kEnableScalarAlign, false));
   } else {
     stmt = NEXT_PASS(RewriteBroadcastVector, stmt);
     stmt = NEXT_PASS(OptimizePragma, stmt);
@@ -815,7 +851,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
     stmt = NEXT_PASS(AutoDoubleBuffer, stmt);
   }
   stmt = NEXT_PASS(InjectAccessPtrMSG, stmt);
-  if (!aicpu) {
+  if (target != "aicpu") {
     stmt = NEXT_PASS(InjectPipe, stmt);
   }
   stmt = NEXT_PASS(ModDivEliminate, stmt);
@@ -853,7 +889,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
   stmt = NEXT_PASS(SpecialValueReplacer, stmt);
   stmt = NEXT_PASS(Simplify, stmt);
 
-  if (!aicpu) {
+  if (target != "aicpu") {
     stmt = NEXT_PASS(InjectSync, stmt);
   }
@@ -925,52 +961,65 @@ void BuildForDevice(const Array<LoweredFunc> &flist, const std::string &target_n
   TVMContext context{kDLCce, 0};
   DLDeviceType device_type = context.device_type;
 
-  Array<LoweredFunc> out_flist_0;
+  Array<LoweredFunc> fhost;
   Array<LoweredFunc> fdevice;
-  for (const auto &func : flist) {
+  for (auto func : flist) {
     if (func->func_type == air::LoweredFuncType::kMixedFunc) {
+      if (target_name == "cuda") {
+        if (BuildConfig::Current()->detect_global_barrier) {
+          func = NEXT_PASS(ThreadSync, func, "global");
+        }
+        func = NEXT_PASS(ThreadSync, func, "shared");
+        func = NEXT_PASS(ThreadSync, func, "warp");
+        func = NEXT_PASS(InferFragment, func);
+        func = NEXT_PASS(LowerThreadAllreduce, func, target->thread_warp_size);
+      }
       Array<LoweredFunc> fsplits = NEXT_PASS(SplitHostDevice, func);
-      out_flist_0.push_back(fsplits[0]);
+      fhost.push_back(fsplits[0]);
       for (size_t idx = 1; idx < fsplits.size(); idx++) {
         fdevice.push_back(fsplits[idx]);
       }
     } else if (func->func_type == air::LoweredFuncType::kHostFunc) {
-      out_flist_0.push_back(func);
+      fhost.push_back(func);
     } else if (func->func_type == air::LoweredFuncType::kDeviceFunc) {
-      out_flist_0.push_back(func);
+      fdevice.push_back(func);
     } else {
       LOG(FATAL) << "unknown function type " << func->func_type;
     }
   }
 
-  Array<LoweredFunc> out_flist_1;
-  for (const auto &func : out_flist_0) {
-    LoweredFunc lowered_func = NEXT_PASS(BindDeviceType, func, static_cast<int>(device_type));
-    out_flist_1.push_back(lowered_func);
+  if (target_name == "cuda") {
+    for (size_t i = 0; i < fdevice.size(); ++i) {
+      fdevice.Set(i, NEXT_PASS(LowerWarpMemory, fdevice[i], target->thread_warp_size));
+    }
   }
-  Array<LoweredFunc> out_flist_2;
-  for (const auto &func : out_flist_1) {
-    LoweredFunc lowered_func = NEXT_PASS(LowerTVMBuiltin, func);
-    out_flist_2.push_back(lowered_func);
+
+  for (size_t i = 0; i < fhost.size(); ++i) {
+    fhost.Set(i, NEXT_PASS(BindDeviceType, fhost[i], static_cast<int>(device_type)));
+    fhost.Set(i, NEXT_PASS(LowerTVMBuiltin, fhost[i]));
   }
 
   Target target_host = Target::Create(target_host_name);
-  Array<LoweredFunc> fdevice_0;
-  for (const auto &func : fdevice) {
-    LoweredFunc lowered_func = NEXT_PASS(LowerIntrin, func, target->target_name);
-    fdevice_0.push_back(lowered_func);
+
+  for (size_t i = 0; i < fdevice.size(); ++i) {
+    if (target_name == "cuda") {
+      fdevice.Set(i, NEXT_PASS(LowerDeviceStorageAccessInfo, fdevice[i]));
+    }
+    fdevice.Set(i, NEXT_PASS(LowerIntrin, fdevice[i], target->target_name));
   }
-  Array<LoweredFunc> out_flist_3;
-  for (const auto &func : out_flist_2) {
-    LoweredFunc lowered_func = NEXT_PASS(LowerIntrin, func, target_host->target_name);
-    out_flist_3.push_back(lowered_func);
+
+  for (size_t i = 0; i < fhost.size(); ++i) {
+    if (target_name == "cuda") {
+      fhost.Set(i, NEXT_PASS(LowerDeviceStorageAccessInfo, fhost[i]));
+    }
+    fhost.Set(i, NEXT_PASS(LowerIntrin, fhost[i], target_host->target_name));
+    fhost.Set(i, NEXT_PASS(CombineContextCall, fhost[i]));
  }
-  for (const auto &func : out_flist_3) {
-    LoweredFunc lowered_func = NEXT_PASS(CombineContextCall, func);
-    out_flist->push_back(lowered_func);
+
+  for (const auto &func : fhost) {
+    out_flist->push_back(func);
   }
-  *out_mdev = air::codegen::Build(fdevice_0, target_name, g_external_call_name);
+  *out_mdev = air::codegen::Build(fdevice, target_name, g_external_call_name);
   return;
 }
@@ -987,7 +1036,7 @@ TVM_REGISTER_NODE_TYPE(BuildRstNode);
 
 BuildRst BuildToFunc(const Schedule &inputs, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars,
                      const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
+                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
                      const BuildConfig &config) {
   CHECK(inputs.defined()) << "inputs is not defined.";
   CHECK(!name.empty()) << "name is empty.";
@@ -1005,7 +1054,7 @@ BuildRst BuildToFunc(const Schedule &inputs, const Array<NodeRef> &in_args, cons
     attrs = in_attrs;
   }
 
-  auto rst = Lower(inputs, args, shape_vars, name, binds, attrs, false, polyhedral, false, aicpu, config);
+  auto rst = Lower(inputs, args, shape_vars, name, binds, attrs, false, polyhedral, false, target, config);
 
   return BuildRstNode::make(rst, name);
 }
@@ -1073,11 +1122,11 @@ air::runtime::Module BuildToModule(const NodeRef &ref, const std::string &target
 }
 
 air::runtime::Module BuildModule(const Schedule &inputs, const Array<NodeRef> &in_args,
-                                  const Array<NodeRef> &shape_vars, const std::string &target_name,
-                                  const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                                  const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
-                                  const BuildConfig &config) {
-  auto func = BuildToFunc(inputs, in_args, shape_vars, name, in_binds, in_attrs, polyhedral, aicpu, config);
+                                 const Array<NodeRef> &shape_vars, const std::string &target_name,
+                                 const std::string &name, const Map<Tensor, Buffer> &in_binds,
+                                 const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
+                                 const BuildConfig &config) {
+  auto func = BuildToFunc(inputs, in_args, shape_vars, name, in_binds, in_attrs, polyhedral, target, config);
  return BuildToModule(func, target_name);
 }
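
The new target == "cuda" branch in Lower mirrors the stock TVM lowering phases (flatten, partition/vectorize/unroll, simplify) and returns either the raw Stmt in simple_mode or a LoweredFunc from MakeAPI; everything below it is the existing Ascend path with the !aicpu checks rewritten as target != "aicpu". A sketch of exercising the CUDA branch through the Python wrapper above; the compute and kernel name are hypothetical:

    import akg
    import akg.tvm

    A = akg.tvm.placeholder((1024,), name="A", dtype="float32")
    B = akg.tvm.compute((1024,), lambda i: A[i] * 2.0, name="B")
    s = akg.tvm.create_schedule(B.op)

    # simple_mode=True should return the Stmt just before MakeAPI, which is a
    # convenient way to inspect what the new cuda phase-1/2/3 passes produced.
    stmt = akg.lower(s, [A, B], name="mul2", simple_mode=True, target="cuda")
    print(stmt)
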
diff --git a/src/composite/composite.cc b/src/composite/composite.cc
index 8be07af..7ad315b 100644
--- a/src/composite/composite.cc
+++ b/src/composite/composite.cc
@@ -454,7 +454,7 @@ NodeRef composite_with_json_to_func(const std::string &json_str, Map<std::string
   config->dump_pass_ir = akg_dump_pass_ir != nullptr;
 
   attrs.Set("pragma_reschedule", make_const(Int(32), 1));
-  auto build_rst = akg::BuildToFunc(sch, args, shape_vars, kernel_name, in_binds, attrs, true, false, config);
+  auto build_rst = akg::BuildToFunc(sch, args, shape_vars, kernel_name, in_binds, attrs, true, "cce", config);
   CHECK(build_rst.defined());
   return build_rst;
 }
@@ -519,7 +519,7 @@ NodeRef composite_lower(const std::string &json_str, Map<std::string, NodeRef> a
   akg::BuildConfig config = akg::BuildConfig::Current();
   CHECK(config.defined());
   bool tuning = attrs.find("tuning") != attrs.end();
-  return akg::Lower(sch, args, shape_vars, kernel_name, in_binds, attrs, false, true, tuning, false, config);
+  return akg::Lower(sch, args, shape_vars, kernel_name, in_binds, attrs, false, true, tuning, "cce", config);
 }
 
 TVM_REGISTER_GLOBAL("composite_with_json_to_func").set_body_typed(composite_with_json_to_func);
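
Both composite entry points deliberately pin the new target argument to "cce", so the composite JSON path is unaffected by the GPU support. They remain reachable from Python through the globals registered above; a hedged sketch, where the JSON string and attrs dict are placeholder inputs:

    import akg.tvm

    # Name comes from the TVM_REGISTER_GLOBAL call in composite.cc.
    composite_to_func = akg.tvm.get_global_func("composite_with_json_to_func")
    # build_rst = composite_to_func(json_str, attrs)  # json_str/attrs are hypothetical
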
diff --git a/src/include/build_module.h b/src/include/build_module.h
index 2716748..020b4e2 100644
--- a/src/include/build_module.h
+++ b/src/include/build_module.h
@@ -47,19 +47,19 @@ class MemoryAllocationException : public std::exception {
 
 NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars, const std::string &name,
               const Map<Tensor, Buffer> &in_binds, const Map<std::string, NodeRef> &in_attrs, bool simple_mode,
-              bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config);
+              bool polyhedral, bool tuning, const std::string &target, const BuildConfig &config);
 
 air::runtime::Module BuildModule(const Schedule &inputs, const Array<NodeRef> &in_args,
                                  const Array<NodeRef> &shape_vars, const std::string &target_name,
                                  const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                                 const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
+                                 const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
                                  const BuildConfig &config);
 
 class BuildRst;
 
 BuildRst BuildToFunc(const Schedule &inputs, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars,
                      const std::string &name, const Map<Tensor, Buffer> &in_binds,
-                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, bool aicpu,
+                     const Map<std::string, NodeRef> &in_attrs, bool polyhedral, const std::string &target,
                      const BuildConfig &config);
 
 air::runtime::Module BuildToModule(const NodeRef &ref, const std::string &target_name = "cce");
-- 
GitLab