/** * Copyright 2019 Huawei Technologies Co., Ltd * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #include #include #include #include #include #include #include #include #include #include "build_module.h" #include "pass/expr_alg_simplify.h" #include "ir_pass.h" #include "schedule_pass.h" #include "codegen/pass_mgr.h" #include "composite/util.h" namespace akg { AttrMap global_attrs; Array g_external_call_name; Tensor CreatePlaceholder(const NodeRef &arg) { auto n = air::make_node(); if (auto var_node = arg.as()) { n->name = var_node->name_hint; n->shape = Array{GetRef(var_node)}; n->dtype = var_node->type; } else if (auto buffer_node = arg.as()) { n->name = buffer_node->name; Expr size = std::accumulate(buffer_node->shape.begin(), buffer_node->shape.end(), Expr(1), [](const Expr &mul, const Expr &e) { return mul * e; }); n->shape = Array{air::ir::Simplify(size)}; n->dtype = buffer_node->dtype; } else if (auto tensor_node = arg.as()) { n->name = tensor_node->op->name; Expr size = std::accumulate(tensor_node->shape.begin(), tensor_node->shape.end(), Expr(1), [](const Expr &mul, const Expr &e) { return mul * e; }); n->shape = Array{air::ir::Simplify(size)}; n->dtype = tensor_node->dtype; } else { LOG(FATAL) << "arg must be Tensor, Buffer or Var, but got " << arg; } return Operation(n).output(0); } Buffer DeclBuffer(const NodeRef &arg, const int data_alignment, const int offset_factor, const std::string &pre_name = "") { // use default value. Array strides; Type dtype; Array shape; std::string name = pre_name; if (auto variable_node = arg.as()) { if (name.empty()) { name = variable_node->name_hint; } shape = Array{GetRef(variable_node)}; dtype = variable_node->type; } else if (auto buffer_node = arg.as()) { if (name.empty()) { name = buffer_node->name; } shape = buffer_node->shape; dtype = buffer_node->dtype; } else if (auto tensor_node = arg.as()) { if (name.empty()) { name = tensor_node->op->name; } shape = tensor_node->shape; dtype = tensor_node->dtype; } else { LOG(FATAL) << "args must be Tensor, Buffer or Var, but got " << arg; } auto data = Variable::make(Handle(), name); Expr elem_offset; if (offset_factor != 0) { elem_offset = Variable::make(shape[0].type(), name + "_elem_offset"); } return BufferNode::make(data, dtype, shape, strides, elem_offset, name, "", data_alignment, offset_factor, BufferType::kDefault); } void GetBinds(const Array &args, const Map &binds, const BuildConfig &config, Array *out_args, Map *out_binds) { for (const auto &b : binds) { out_binds->Set(b.first, b.second); } for (const auto &x : args) { if (x->IsInstance()) { auto tensor_node = GetRef(x.as()); if (out_binds->find(tensor_node) == out_binds->end()) { auto buf = DeclBuffer(tensor_node, config->data_alignment, config->offset_factor); out_binds->Set(tensor_node, buf); out_args->push_back(buf); } else { out_args->push_back((*out_binds)[tensor_node]); } } else if (x->IsInstance()) { out_args->push_back(x); } else if (x->IsInstance()) { out_args->push_back(x); } else { LOG(FATAL) << "args must be Tensor, Buffer or Var, but got " << x; } } return; } void GetFlattenedBinds(const Array &args, const Map &binds, const BuildConfig &config, Array &out_args, Map &out_binds, bool is_dynamic) { std::unordered_map flag_binds; // the map aims to remove duplicate names between binds and args // because in-place ops (e.g. assign_add) use the same buffer as input and args, and duplicates need to be removed std::unordered_map bind_name_to_buffer_map; for (const auto &b : binds) { static_cast(bind_name_to_buffer_map.emplace(b.first->op->func_name(), b.second)); } for (const auto &x : args) { if (x->IsInstance()) { auto tensor_node = GetRef(x.as()); auto tensor_name = tensor_node->op->func_name(); CHECK_NE(bind_name_to_buffer_map.count(tensor_name), 0) << "undefined tensor " << x; auto bind_buffer = bind_name_to_buffer_map[tensor_name]; flag_binds[tensor_node] = true; Tensor nx = CreatePlaceholder(tensor_node); bool find_buf = false; for (auto iter : out_binds) { Buffer buffer = iter.second; if (bind_buffer->name == buffer->name) { out_binds.Set(nx, buffer); find_buf = true; break; } } if (!find_buf) { Buffer buf = DeclBuffer(nx, config->data_alignment, config->offset_factor, bind_buffer->name); out_binds.Set(nx, buf); out_args.push_back(buf); } else { out_args.push_back(bind_buffer); } } else if (x->IsInstance()) { out_args.push_back(x); } else if (x->IsInstance()) { out_args.push_back(x); } else { LOG(FATAL) << "args must be Tensor, Buffer or Var"; } } for (const auto &x : binds) { Tensor x_tensor = x.first; if (flag_binds.insert(std::pair{x_tensor, true}).second) { Tensor nx = CreatePlaceholder(x_tensor); bool find_buf = false; for (auto iter : out_binds) { Buffer buffer = iter.second; if (binds[x_tensor]->name == buffer->name) { out_binds.Set(nx, buffer); find_buf = true; } } if (!find_buf) { Buffer buf = DeclBuffer(nx, config->data_alignment, config->offset_factor, binds[x_tensor]->name); out_binds.Set(nx, buf); } } } // Just for reshape in dynamic mode if (is_dynamic) { Tensor in_tensor, out_tensor; bool is_reshape = false; if (out_binds.size() == 2 && args.size() == 2) { for (auto tb : out_binds) { if (tb.first->op->name == "reshape" || tb.first->op->name == "reshape_cast") { out_tensor = tb.first; is_reshape = true; } else { in_tensor = tb.first; } } } if (is_reshape) { Map new_binds; Array new_args; auto n = air::make_node(); n->name = out_tensor->op->name; n->shape = in_tensor->shape; n->dtype = out_tensor->dtype; Tensor ten = Operation(n).output(0); Buffer buf = DeclBuffer(ten, config->data_alignment, config->offset_factor, n->name); new_binds.Set(in_tensor, out_binds[in_tensor]); new_binds.Set(ten, buf); new_args.push_back(out_binds[in_tensor]); new_args.push_back(buf); out_binds = new_binds; out_args = new_args; } } } void RenameBinds(Map &binds, const BuildConfig &config, Array &tensor_args_list, Array &buffer_args_list, Map &tensor_replace) { std::unordered_map tensor_name_count; std::set tensor_name; Map out_binds; Map buffer_replace; bool rename_flag = false; // count the number of times for binds name, if op->name's count greater than 1, need rename op->name for (const auto &x : binds) { ++tensor_name_count[x.first->op->name]; } // if binds' name conflict, firstly rename tensor_name, then construct new mappings, finally set to out_binds for (const auto &x : binds) { const auto &old_tensor = x.first; const auto &old_buffer = x.second; if (tensor_name_count[old_tensor->op->name] > 1) { int idx = 0; std::string new_name = old_tensor->op->name; std::string extend; do { extend = "_rename_" + std::to_string(++idx); } while (tensor_name.count(new_name + extend) != 0); new_name.append(extend); tensor_name.insert(new_name); auto cop = old_tensor->op.as(); CHECK(cop != nullptr); Tensor new_tensor = air::ComputeOpNode::make(new_name, cop->tag, cop->attrs, cop->axis, cop->body).output(0); tensor_replace.Set(old_tensor, new_tensor); if (buffer_replace.count(old_buffer) == 0) { auto new_buffer = DeclBuffer(new_tensor, config->data_alignment, config->offset_factor, new_name); buffer_replace.Set(old_buffer, new_buffer); out_binds.Set(new_tensor, new_buffer); } rename_flag = true; } } // if there is no conflict in binds name, just do out_binds = binds // else need use new_buffer to replace old_buffer to insert out_binds auto UpdateOutBinds = [&](Map &out_binds) -> Map & { for (const auto &it : binds) { const auto &tensor_node = it.first; const auto &buffer_node = it.second; if (tensor_name_count[tensor_node->op->name] == 1) { if (buffer_replace.count(buffer_node) > 0) { out_binds.Set(tensor_node, buffer_replace[buffer_node]); } else { out_binds.Set(tensor_node, buffer_node); } } } return out_binds; }; // traverse the list of tensor_args, according to tensor_node to update tensor_args_list auto UpdateArgsByTensor = [&tensor_args_list, &tensor_replace]() { Array new_tensor_args_list; for (const auto &x : tensor_args_list) { if (x->IsInstance()) { Tensor tensor_node = GetRef(x.as()); if (tensor_replace.count(tensor_node) != 0) { new_tensor_args_list.push_back(tensor_replace[tensor_node]); } else { new_tensor_args_list.push_back(tensor_node); } } else { new_tensor_args_list.push_back(x); } } return new_tensor_args_list; }; // traverse the list of buffer_args, according to buffer_node to update buffer_args_list auto UpdateArgsByBuffer = [&buffer_args_list, &buffer_replace]() { Array new_buffer_args_list; for (const auto &x : buffer_args_list) { if (x->IsInstance()) { Buffer buffer_node = GetRef(x.as()); if (buffer_replace.count(buffer_node) != 0) { new_buffer_args_list.push_back(buffer_replace[buffer_node]); } else { new_buffer_args_list.push_back(buffer_node); } } else { new_buffer_args_list.push_back(x); } } return new_buffer_args_list; }; // if rename tensor_name, need to update tensor_args and buffer_args if (rename_flag) { tensor_args_list = UpdateArgsByTensor(); buffer_args_list = UpdateArgsByBuffer(); binds = UpdateOutBinds(out_binds); } return; } void FixParametricBinds(const Map &binds, const Array &in_args, const BuildConfig &config, Map *out_binds, Array *out_args) { Expr H = 0; Expr W = 0; Expr PT = 0; Expr PB = 0; Expr PL = 0; Expr PR = 0; Expr KH = 0; Expr KW = 0; Expr SH = 0; Expr SW = 0; Expr CI1 = 0; std::string feature = "input_1_1"; std::string kernel = "input_1_2"; std::string bias = "input_1_3"; std::string output = "output"; Buffer feature_buffer; Buffer kernel_buffer; Buffer bias_buffer; Buffer output_buffer; for (const auto &x : in_args) { if (auto buf = x.as()) { if (buf->name.find(feature) != std::string::npos) { feature_buffer = Downcast(x); } if (buf->name.find(bias) != std::string::npos) { bias_buffer = Downcast(x); } if (buf->name.find(output) != std::string::npos || buf->name.find(kernel) != std::string::npos) { continue; } } if (auto v = x.as()) { if (v->name_hint == "H") { H = Downcast(x); } else if (v->name_hint == "W") { W = Downcast(x); } else if (v->name_hint == "PT") { PT = Downcast(x); } else if (v->name_hint == "PB") { PB = Downcast(x); } else if (v->name_hint == "PL") { PL = Downcast(x); } else if (v->name_hint == "PR") { PR = Downcast(x); } else if (v->name_hint == "KH") { KH = Downcast(x); } else if (v->name_hint == "KW") { KW = Downcast(x); } else if (v->name_hint == "SH") { SH = Downcast(x); } else if (v->name_hint == "SW") { SW = Downcast(x); } else if (v->name_hint == "CI1") { CI1 = Downcast(x); } } } for (const auto &x : binds) { Array shape; if (x.second->name.find(output) != std::string::npos) { CHECK_EQ(x.second->shape.size(), 5); shape.push_back(x.second->shape[0]); shape.push_back(x.second->shape[1]); auto h = air::floordiv(H + PT + PB - KH, SH) + 1; auto w = air::floordiv(W + PL + PR - KW, SW) + 1; shape.push_back(h); shape.push_back(w); shape.push_back(x.second->shape[4]); Tensor tt = air::placeholder(shape, x.second->dtype, x.second->name); output_buffer = DeclBuffer(tt, config->data_alignment, config->offset_factor, x.second->name); out_binds->Set(tt, output_buffer); } else if (x.second->name.find(kernel) != std::string::npos) { CHECK_EQ(x.second->shape.size(), 4); auto n = CI1 * KH * KW; shape.push_back(n); shape.push_back(x.second->shape[1]); shape.push_back(x.second->shape[2]); shape.push_back(x.second->shape[3]); Tensor tt = air::placeholder(shape, x.second->dtype, x.second->name); kernel_buffer = DeclBuffer(tt, config->data_alignment, config->offset_factor, x.second->name); out_binds->Set(tt, kernel_buffer); } else { out_binds->Set(x.first, x.second); } } if (feature_buffer.defined()) { out_args->push_back(feature_buffer); } if (kernel_buffer.defined()) { out_args->push_back(kernel_buffer); } if (bias_buffer.defined()) { out_args->push_back(bias_buffer); } if (output_buffer.defined()) { out_args->push_back(output_buffer); } for (const auto &x : in_args) { if (x.as()) { out_args->push_back(x); } } } NodeRef Lower(Schedule sch, const Array &in_args, const Array &shape_vars, const std::string &name, const Map &in_binds, const Map &in_attrs, bool simple_mode, bool polyhedral, bool tuning, const std::string &target, const BuildConfig &config) { ir::TestExprCompuationSimplify(); CHECK(sch.defined()) << "sch is not defined."; CHECK(!name.empty()) << "name is empty."; CHECK(find_if(name.begin(), name.end(), [](char c) { return !std::isalnum(c) && c != '_'; }) == name.end()) << "kernel name contains invalid chars: " << name; Array args; if (in_args.defined()) { args = in_args; } Map binds; if (in_binds.defined()) { binds = in_binds; } if (in_attrs.defined()) { global_attrs = in_attrs; } PassMgr::ClearPassId(); PassTimer *pass_timer = PassTimer::GetInstance(); global_attrs.Set(kKernelName, StringImm::make(name)); global_attrs.Set(kDumpPassIr, air::make_const(Int(32), config->dump_pass_ir)); if (config->dump_pass_ir) { std::string dump_ir_dir; if (global_attrs.GetStringAttr(kDumpIrDir, &dump_ir_dir)) { PassMgr::SetDir(dump_ir_dir); } else { PassMgr::SetDir(name); } CreateDir(PassMgr::GetDir()); std::string dump_poly_dir; if (!global_attrs.GetStringAttr(kDumpPolyDir, &dump_poly_dir)) { dump_poly_dir = PassMgr::GetDir() + "/poly"; global_attrs.Set(kDumpPolyDir, StringImm::make(dump_poly_dir)); } CreateDir(dump_poly_dir); } Array arg_list_0; Map binds_0; GetBinds(args, binds, config, &arg_list_0, &binds_0); // Phase 0 if (polyhedral && global_attrs.GetBoolAttr(kEnableAutoInline, true)) { akg::schedule::AutoInline(sch); } auto new_sch = sch.normalize(); auto bounds = air::schedule::InferBound(new_sch); Stmt stmt = make_pass("schedule.ScheduleOps", new_sch, bounds, false); if (target == "cuda") { // Phase 1 stmt = NEXT_PASS(RewriteForTensorCore, stmt, new_sch, binds_0); stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64, config->instrument_bound_checkers); stmt = NEXT_PASS(CanonicalSimplify, stmt); // Phase 2 if (!simple_mode) { stmt = NEXT_PASS(LoopPartition, stmt, config->partition_const_loop); } if (config->disable_vectorize) { stmt = NEXT_PASS(SkipVectorize, stmt); } else { stmt = NEXT_PASS(VectorizeLoop, stmt); } stmt = NEXT_PASS(InjectVirtualThread, stmt); stmt = NEXT_PASS(InjectDoubleBuffer, stmt, config->double_buffer_split_loop); stmt = NEXT_PASS(StorageRewrite, stmt); stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth, config->auto_unroll_max_extent, config->unroll_explicit); // Phase 3 stmt = NEXT_PASS(Simplify, stmt); stmt = NEXT_PASS(RemoveNoOp, stmt); if (config->instrument_bound_checkers) { stmt = NEXT_PASS(InstrumentBoundCheckers, stmt); } if (simple_mode) { return stmt; } LoweredFunc lowered_func = NEXT_PASS(MakeAPI, stmt, name, arg_list_0, 0, config->restricted_func); return lowered_func; } if (!polyhedral) { // for conv-matmul manual schedule stmt = NEXT_PASS(AutoMadPragmaAttr, stmt, true); } stmt = NEXT_PASS(RewriteMultiValueFunc, stmt); Map replace; RenameBinds(binds_0, config, args, arg_list_0, replace); PassMgr::SetArgs(arg_list_0); stmt = NEXT_PASS(RenameRealize, stmt, binds_0, replace); bool is_dynamic = !shape_vars.empty(); global_attrs.Set(kIsDynamic, air::make_const(Int(32), is_dynamic)); Array arg_list_1; Map binds_1; GetFlattenedBinds(args, binds_0, config, arg_list_1, binds_1, is_dynamic); Stmt stmt1 = NEXT_PASS(ElementwiseFlatten, stmt, binds_0, binds_1); if (stmt1.get() != stmt.get()) { stmt = stmt1; arg_list_0 = arg_list_1; binds_0 = binds_1; } for (auto &node : shape_vars) { if (node.as()) { arg_list_0.push_back(node); } } PassMgr::SetArgs(arg_list_0); if (target != "aicpu") { stmt = NEXT_PASS(MathIntrinRewrite, stmt); } if (global_attrs.GetBoolAttr(kEnableRewriteScalarCompute, false)) { stmt = NEXT_PASS(ScalarComputeRewrite, stmt); } // Phase 1 if (target != "aicpu" && polyhedral) { stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0); stmt = NEXT_PASS(CheckShapeParams, stmt, binds_0); stmt = NEXT_PASS(AlignPartitionCCE, stmt); // Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning, // 4 : whether to partition convolution or not if (global_attrs.GetBoolAttr(kEnablePrePolyLoopPartition, true)) { stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral); } if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) { stmt = NEXT_PASS(UnrollNonConstantExtent, stmt); } if (global_attrs.GetBoolAttr(kExtentToCond, true)) { stmt = NEXT_PASS(ConvertExtentToCond, stmt, binds_0); } if (global_attrs.GetBoolAttr(kEnableToThreeAddress, true)) { if (global_attrs.count(kToThreeAddressCrossSimply) != 0) { // Not combine with reuse tensors stmt = NEXT_PASS(ToThreeAddress, stmt, false, 0, true); } else { if (global_attrs.GetBoolAttr(kToThreeAddressReuse, false)) { int min_split = global_attrs.GetIntAttr(kToThreeAddressMinSplit, 10); if (min_split > 0) { stmt = NEXT_PASS(ToThreeAddress, stmt, true, min_split); } else { stmt = NEXT_PASS(ToThreeAddress, stmt, true); } } else { stmt = NEXT_PASS(ToThreeAddress, stmt); } } } if (!global_attrs.GetBoolAttr(kDisableCse, false)) { stmt = NEXT_PASS(StmtCSE, stmt, binds_0); } if (!global_attrs.GetBoolAttr(kDisableVn, false)) { stmt = NEXT_PASS(ValueNumbering, stmt); } if (!global_attrs.GetBoolAttr(kDisableHalfToFloatSumOpt, false)) { stmt = NEXT_PASS(HalfReduceSumRewrite, stmt, binds_0); } stmt = NEXT_PASS(StmtPatternRewrite, stmt); stmt = NEXT_PASS(CopyPropagation, stmt, binds_0); stmt = NEXT_PASS(MathIntrinRewrite, stmt); if (global_attrs.GetBoolAttr(kRewriteVarTensorIdx, false)) { stmt = NEXT_PASS(RewriteVarTensorIdx, stmt, binds_0); } else { stmt = NEXT_PASS(RewriteTensorIndex, stmt); } if (global_attrs.GetBoolAttr(kEnableFeatureLibrary, false) || global_attrs.GetBoolAttr(kEnableFeatureLibraryPrePoly, false)) { stmt = NEXT_PASS(FeatureLibTransform, stmt); } stmt = NEXT_PASS(UnrollLoop, stmt, -1, -1, 1, true); stmt = NEXT_PASS(SinkIfStmt, stmt); int level = global_attrs.GetIntAttr(kHelpTiling, -1); if (tuning || level > help_tiling_level["None"]) { if (tuning) { level = help_tiling_level["Tuning"]; } Map attrs_1 = global_attrs; attrs_1.Set(kDumpTuningLevel, air::make_const(Int(32), level)); NodeRef tuning_spaces = NEXT_PASS(GenTuningSpace, stmt, binds_0, attrs_1, false); return tuning_spaces; } } // micro-tuning configs: current strategy is to retry autopoly up to 3 times when storage flatten/rewrite fails bool need_micro_tuning = target != "aicpu" && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty(); const int max_enter_poly_times = global_attrs.GetIntAttr(kMaxNumRetryPoly, need_micro_tuning ? 4 : 1); int enter_count = 0; Stmt stmt_before_poly = stmt; while (enter_count < max_enter_poly_times) { if (target != "aicpu" && polyhedral) { Array poly_res = NEXT_PASS(AutoPoly, stmt_before_poly, binds_0, global_attrs, false, is_dynamic); enter_count++; CHECK_EQ(poly_res.size(), 2); stmt = air::Downcast(poly_res[0]); Array tiling_params = air::Downcast>(poly_res[1]); for (const auto &var : tiling_params) { arg_list_0.push_back(var); } if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) { Array arg_list_2; Map binds_2; FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2); stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2); arg_list_0 = arg_list_2; binds_0 = binds_2; } if (is_dynamic) { if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) { stmt = NEXT_PASS(SubstituteDivVar, stmt); } // fix var addresses because poly identify vars by name stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0); // isolate dynamic tile loops (isolate body and tail) if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) { stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false)); stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0); } } // pls do not insert pass between AutoPoly and cube special pass. // cube special pass begin stmt = NEXT_PASS(ExprPatternRewrite, stmt); stmt = NEXT_PASS(AutoMadPragmaAttr, stmt); stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic); stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0); stmt = NEXT_PASS(PostProcessImg2col, stmt); stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic); stmt = NEXT_PASS(BypassL1, stmt); if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) { stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic); } stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic); // cube special pass end stmt = NEXT_PASS(CopyPropagation, stmt, binds_0); stmt = NEXT_PASS(ConvertCondToExtent, stmt); bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false); if (enable_convert_if) { stmt = NEXT_PASS(FixRealizeShape, stmt); } if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) { stmt = NEXT_PASS(DMASink, stmt); } stmt = NEXT_PASS(LowerWith, stmt); stmt = NEXT_PASS(ForEliminate, stmt); stmt = NEXT_PASS(RealizeCompress, stmt); if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) { stmt = NEXT_PASS(LoopNormlize, stmt); } stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic); stmt = NEXT_PASS(InjectAttr, stmt); stmt = NEXT_PASS(ModDivEliminate, stmt); if (enable_convert_if) { stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0); stmt = NEXT_PASS(FixLoopExtent, stmt); stmt = NEXT_PASS(ConvertIfToSelect, stmt); } } try { stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64); } catch (const std::runtime_error &e) { if (enter_count >= max_enter_poly_times) { CHECK(false) << e.what(); } global_attrs.Set(kErrorInfo, StringImm::make(e.what())); continue; } stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false)); if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) { stmt = NEXT_PASS(AlgebraSimplify, stmt); } if (is_dynamic) { stmt = NEXT_PASS(UnifyAllocate, stmt); } if (global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) { stmt = NEXT_PASS(PreProcess4Multicore, stmt); } int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1); if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) { stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt); } stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false)); // Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning, // 4 : whether to partition convolution or not if (target != "aicpu" && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) { stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral); } if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) { stmt = NEXT_PASS(SinkAllocate, stmt); } if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) { // For the Manual scheduling or When polyhedral is not used stmt = NEXT_PASS(UnrollNonConstantExtent, stmt); } if (!polyhedral) { // fix mad attributes and remove dead computations for the manual schedule stmt = NEXT_PASS(FixMadAttrs, stmt); } if (!is_dynamic) { stmt = NEXT_PASS(CanonicalSimplify, stmt); } stmt = NEXT_PASS(ForEliminate, stmt); if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) { stmt = NEXT_PASS(AlgebraSimplify, stmt); } if (!is_dynamic) { stmt = NEXT_PASS(FixLoopExtent, stmt); } if (target != "aicpu") { stmt = NEXT_PASS(AutoPragma, stmt); } stmt = NEXT_PASS(EliminateAtomicDma, stmt); if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) { stmt = NEXT_PASS(DeadCodeElim, stmt); } if (is_dynamic) { stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true), global_attrs.GetIntAttr(kEnableScalarAlign, false)); } else { stmt = NEXT_PASS(RewriteBroadcastVector, stmt); stmt = NEXT_PASS(OptimizePragma, stmt); stmt = NEXT_PASS(MergeLoops, stmt, false); stmt = NEXT_PASS(PackStore, stmt); stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt); stmt = NEXT_PASS(RecoverStore, stmt); } stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic); stmt = NEXT_PASS(AutoReorder, stmt); if (enable_multicore != 0) { if (is_dynamic && enable_multicore == 1) { Var block_dim = Variable::make(Int(32), "blockDim"); Array multicore_res = NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0)); CHECK_EQ(multicore_res.size(), 2); stmt = air::Downcast(multicore_res[0]); auto extent_thread = air::Downcast(multicore_res[1]); if (extent_thread.as()->value == -1) { arg_list_0.push_back(block_dim); } } else { int block_dim = enable_multicore == 1 ? -1 : enable_multicore; stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic, global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false)); } } if (!is_dynamic) { RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, false)); } stmt = NEXT_PASS(SelectLower, stmt); stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt); if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) { stmt = NEXT_PASS(GatherLoopInfo, stmt); } stmt = NEXT_PASS(CastFilter, stmt); if (!is_dynamic) { stmt = NEXT_PASS(SplitTail, stmt); } stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true), global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic); // must be after EmitInsn stmt = NEXT_PASS(TileCoverCorrect, stmt); if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) { // simulated blocks > 240 000 => simulated case takes too much time (> 10 sec) // number of protections > 128 => too many brackets in the if statement throw an error stmt = NEXT_PASS(CoverProtection, stmt, 240000, 128); } stmt = NEXT_PASS(ConvertDivModToShift, stmt); if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) { // for conv manual schedule and load3d stmt = NEXT_PASS(CoarsenImg2Col, stmt); } stmt = NEXT_PASS(DTypeAdapter, stmt); if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) { stmt = NEXT_PASS(HoistInsn, stmt); } // temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) { stmt = NEXT_PASS(InvariantHoist, stmt); } stmt = NEXT_PASS(SetVectorMaskDefault, stmt); stmt = NEXT_PASS(ElimVectorMask, stmt); stmt = NEXT_PASS(ElimDMA, stmt); if (!is_dynamic) { stmt = NEXT_PASS(MultiCorePartition, stmt); } if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) { stmt = NEXT_PASS(AutoDoubleBuffer, stmt); } stmt = NEXT_PASS(InjectAccessPtrMSG, stmt); if (target != "aicpu") { stmt = NEXT_PASS(InjectPipe, stmt); } stmt = NEXT_PASS(ModDivEliminate, stmt); // Phase 2 if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) { stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral); } if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) { stmt = NEXT_PASS(AlgebraSimplify, stmt); } std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string()); // attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true); // run MaxSAT solver for bank conflicts with no limits on model size or runtime bool bc_no_limits = false; // timeout for MaxSAT solver in seconds (int) int maxsat_timeout = 4; try { stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout); } catch (MemoryAllocationException &e) { if (enter_count >= max_enter_poly_times) { CHECK(false) << e.what(); } global_attrs.Set(kAllocBits, air::make_const(Int(32), e.alloc_bits_ + e.need_bits_)); global_attrs.Set(kErrorScope, StringImm::make(e.scope_)); continue; } break; } if (!is_dynamic) stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth, config->auto_unroll_max_extent, config->unroll_explicit); stmt = NEXT_PASS(SpecialValueReplacer, stmt); stmt = NEXT_PASS(Simplify, stmt); if (target != "aicpu") { stmt = NEXT_PASS(InjectSync, stmt); } // Phase 3 stmt = NEXT_PASS(RemoveAccessPtrMSG, stmt); if (is_dynamic) { // check undefined loop vars stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0); stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0); if (global_attrs.GetBoolAttr(kPromoteCommonExpr, true)) { stmt = NEXT_PASS(PromoteCommonExpr, stmt); } if (global_attrs.GetBoolAttr(kPromoteConstExpr, true)) { stmt = NEXT_PASS(PromoteConstExpr, stmt); } } stmt = NEXT_PASS(Simplify, stmt); stmt = NEXT_PASS(LowerStorageAccessInfoCCE, stmt); if (is_dynamic) { stmt = NEXT_PASS(RewriteFloorDiv, stmt); stmt = NEXT_PASS(RemoveAssert, stmt); } stmt = NEXT_PASS(RemoveNoOp, stmt); if (is_dynamic) { stmt = NEXT_PASS(SpecifyMinMaxDataType, stmt); } if (!config->disable_select_rewriting) { stmt = NEXT_PASS(RewriteUnsafeSelect, stmt); } if (is_dynamic) { Array collect_res = NEXT_PASS(CollectExternalCall, stmt); CHECK_EQ(collect_res.size(), 2); stmt = air::Downcast(collect_res[0]); g_external_call_name = air::Downcast>(collect_res[1]); // CastKernelParams should be before DecorateDeviceScope Array cast_res = NEXT_PASS(CastKernelParams, stmt, arg_list_0); CHECK_EQ(cast_res.size(), 2); stmt = air::Downcast(cast_res[0]); arg_list_0 = air::Downcast>(cast_res[1]); } stmt = NEXT_PASS(DecorateDeviceScope, stmt); // Instrument BoundCheckers if (config->instrument_bound_checkers) { stmt = NEXT_PASS(InstrumentBoundCheckers, stmt); } if (simple_mode) { return stmt; } PassMgr::SetArgs(arg_list_0); LoweredFunc lowered_func = NEXT_PASS(MakeAPI, stmt, name, arg_list_0, 0, config->restricted_func); LOG(INFO) << *pass_timer; pass_timer->Clear(); return lowered_func; } void BuildForDevice(const Array &flist, const std::string &target_name, const std::string &target_host_name, Array *out_flist, air::runtime::Module *out_mdev) { CHECK(out_flist != nullptr) << "out_flist is nullptr."; CHECK(out_mdev != nullptr) << "out_mdev is nullptr."; Target target = Target::Create(target_name); TVMContext context{kDLCce, 0}; DLDeviceType device_type = context.device_type; Array fhost; Array fdevice; for (auto func : flist) { if (func->func_type == air::LoweredFuncType::kMixedFunc) { if (target_name == "cuda") { if (BuildConfig::Current()->detect_global_barrier) { func = NEXT_PASS(ThreadSync, func, "global"); } func = NEXT_PASS(ThreadSync, func, "shared"); func = NEXT_PASS(ThreadSync, func, "warp"); func = NEXT_PASS(InferFragment, func); func = NEXT_PASS(LowerThreadAllreduce, func, target->thread_warp_size); } Array fsplits = NEXT_PASS(SplitHostDevice, func); fhost.push_back(fsplits[0]); for (size_t idx = 1; idx < fsplits.size(); idx++) { fdevice.push_back(fsplits[idx]); } } else if (func->func_type == air::LoweredFuncType::kHostFunc) { fhost.push_back(func); } else if (func->func_type == air::LoweredFuncType::kDeviceFunc) { fdevice.push_back(func); } else { LOG(FATAL) << "unknown function type " << func->func_type; } } if (target_name == "cuda") { for (size_t i = 0; i < fdevice.size(); ++i) { fdevice.Set(i, NEXT_PASS(LowerWarpMemory, fdevice[i], target->thread_warp_size)); } } for (size_t i = 0; i < fhost.size(); ++i) { fhost.Set(i, NEXT_PASS(BindDeviceType, fhost[i], static_cast(device_type))); fhost.Set(i, NEXT_PASS(LowerTVMBuiltin, fhost[i])); } Target target_host = Target::Create(target_host_name); for (size_t i = 0; i < fdevice.size(); ++i) { if (target_name == "cuda") { fdevice.Set(i, NEXT_PASS(LowerDeviceStorageAccessInfo, fdevice[i])); } fdevice.Set(i, NEXT_PASS(LowerIntrin, fdevice[i], target->target_name)); } for (size_t i = 0; i < fhost.size(); ++i) { if (target_name == "cuda") { fhost.Set(i, NEXT_PASS(LowerDeviceStorageAccessInfo, fhost[i])); } fhost.Set(i, NEXT_PASS(LowerIntrin, fhost[i], target_host->target_name)); fhost.Set(i, NEXT_PASS(CombineContextCall, fhost[i])); } for (const auto &func : fhost) { out_flist->push_back(func); } *out_mdev = air::codegen::Build(fdevice, target_name, g_external_call_name); return; } BuildRst BuildRstNode::make(const NodeRef &rst, const std::string &kernel_name) { NodePtr node = make_node(); node->rst = rst; node->kernel_name = kernel_name; return BuildRst(node); } TVM_REGISTER_NODE_TYPE(BuildRstNode); BuildRst BuildToFunc(const Schedule &inputs, const Array &in_args, const Array &shape_vars, const std::string &name, const Map &in_binds, const Map &in_attrs, bool polyhedral, const std::string &target, const BuildConfig &config) { CHECK(inputs.defined()) << "inputs is not defined."; CHECK(!name.empty()) << "name is empty."; Array args; if (in_args.defined()) { args = in_args; } Map binds; if (in_binds.defined()) { binds = in_binds; } Map attrs; if (in_attrs.defined()) { attrs = in_attrs; } auto rst = Lower(inputs, args, shape_vars, name, binds, attrs, false, polyhedral, false, target, config); return BuildRstNode::make(rst, name); } namespace { void CreateCce(const std::string &code, const std::string &kernel_name) { std::string file_name = kMsDavinciKernelPath; file_name.append(kernel_name).append(".cce"); std::ofstream of(file_name); CHECK(of.is_open()) << "Failed to open " << file_name << " to dump cce."; of << code << std::endl; of.close(); } } // namespace air::runtime::Module BuildToModule(const NodeRef &ref, const std::string &target_name) { CHECK(!target_name.empty()) << "target_name is empty."; auto build_rst = Downcast(ref); auto res = build_rst->rst; Array lowered_func_list; if (res->IsInstance()) { LoweredFunc lowered_func = air::Downcast(res); lowered_func_list.push_back(lowered_func); } if (lowered_func_list.empty()) { return air::runtime::Module(nullptr); } Map> target_flist; target_flist.Set(target_name, lowered_func_list); Array fhost_all; std::vector device_modules; for (auto iter : target_flist) { Array out_flist; air::runtime::Module out_mdev; BuildForDevice(iter.second, iter.first, kAkgTargetHostName, &out_flist, &out_mdev); // Save the current lowered functions of the host and the device module. for (const auto &func : out_flist) { fhost_all.push_back(func); } device_modules.push_back(out_mdev); } // Generate a unified host module. air::runtime::Module mhost = air::codegen::Build(fhost_all, kAkgTargetHostName, g_external_call_name); // Import all modules. for (const auto &mdev : device_modules) { mhost.Import(mdev); } const char *akg_dump_cce = getenv("MS_AKG_DUMP_CCE"); if (akg_dump_cce != nullptr) { auto mod0 = mhost->imports()[0]; CHECK(mod0.defined()); CreateCce(mod0->GetSource(), build_rst->kernel_name); } return mhost; } air::runtime::Module BuildModule(const Schedule &inputs, const Array &in_args, const Array &shape_vars, const std::string &target_name, const std::string &name, const Map &in_binds, const Map &in_attrs, bool polyhedral, const std::string &target, const BuildConfig &config) { auto func = BuildToFunc(inputs, in_args, shape_vars, name, in_binds, in_attrs, polyhedral, target, config); return BuildToModule(func, target_name); } TVM_REGISTER_API("_BuildModule").set_body_typed(BuildModule); TVM_REGISTER_API("_BuildToFunc").set_body_typed(BuildToFunc); TVM_REGISTER_API("_BuildToModule").set_body([](const TVMArgs &args, TVMRetValue *ret) { if (args.size() == 1) { *ret = BuildToModule(args[0]); } else if (args.size() == 2) { *ret = BuildToModule(args[0], args[1]); } else { LOG(FATAL) << "arg num must be 1 or 2, but given " << args.size(); } }); TVM_REGISTER_API("_Lower").set_body([](const TVMArgs &args, TVMRetValue *ret) { if (args.size() == 11) { NodeRef lowered_func = Lower(args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10]); *ret = lowered_func; } }); TVM_REGISTER_API("akg.build_module.get_binds").set_body([](const TVMArgs &args, TVMRetValue *ret) { auto config = BuildConfig::Current(); Array inputs; Map binds; if (args.size() >= 1) { inputs = args[0]; } else if (args.size() >= 2) { inputs = args[0]; binds = args[1]; } Array out_inputs; Map out_binds; GetBinds(inputs, binds, config, &out_inputs, &out_binds); *ret = Array{out_binds, out_inputs}; }); } // namespace akg