Commit 4d1be48d authored by mindspore-ci-bot, committed by Gitee

!53 support dynamic memory allocation ratio adjustment in micro-tuning for the allocation-exceeded problem
Merge pull request !53 from yangsijia/feature/micro-tuning
......@@ -116,8 +116,8 @@ def four2five_tiling_strategy_dynamic(tensor, input_format):
strategy.append(ct_util.create_constraint_on_tensor(tensor, 16, ct_util.TileConstraint.FACTOR, 4)[0])
return strategy
@vc_util.check_input_type(akg.tvm.tensor.Tensor, str, str)
def four2five(data, format_, dst_dtype='float16'):
@vc_util.check_input_type(akg.tvm.tensor.Tensor, str, str, bool)
def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True):
"""
Convert 4-dim "data" to 5-dim; the format of "data" is defined in "format_".
......@@ -294,8 +294,9 @@ def four2five(data, format_, dst_dtype='float16'):
dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype)
if dim_info != "":
attrs["dim"] = dim_info
attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion)
else:
if need_custom_tiling:
attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion)
elif need_custom_tiling:
attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(output, format_)
if is_dynamic:
......
......@@ -458,7 +458,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
PassTimer *pass_timer = PassTimer::GetInstance();
global_attrs.Set(kKernelName, StringImm::make(name));
global_attrs.Set(kDumpPassIr, ktvm::make_const(Int(1), config->dump_pass_ir));
global_attrs.Set(kDumpPassIr, ktvm::make_const(Int(32), config->dump_pass_ir));
if (config->dump_pass_ir) {
std::string dump_ir_dir;
if (global_attrs.GetStringAttr(kDumpIrDir, &dump_ir_dir)) {
......@@ -498,7 +498,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
stmt = NEXT_PASS(RenameRealize, stmt, binds_0, replace);
bool is_dynamic = !shape_vars.empty();
global_attrs.Set(kIsDynamic, ktvm::make_const(Int(1), is_dynamic));
global_attrs.Set(kIsDynamic, ktvm::make_const(Int(32), is_dynamic));
Array<NodeRef> arg_list_1;
Map<Tensor, Buffer> binds_1;
......@@ -594,227 +594,255 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
NodeRef tuning_spaces = NEXT_PASS(GenTuningSpace, stmt, binds_0, attrs_1, false);
return tuning_spaces;
}
Array<NodeRef> poly_res = NEXT_PASS(AutoPoly, stmt, binds_0, global_attrs, false, is_dynamic);
CHECK_EQ(poly_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(poly_res[0]);
Array<ktvm::Var> tiling_params = ktvm::Downcast<Array<ktvm::Var>>(poly_res[1]);
for (const auto &var : tiling_params) {
arg_list_0.push_back(var);
}
}
if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) {
Array<NodeRef> arg_list_2;
Map<Tensor, Buffer> binds_2;
FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2);
stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2);
arg_list_0 = arg_list_2;
binds_0 = binds_2;
}
// micro-tuning configs: current strategy is to retry autopoly up to 3 times when storage flatten/rewrite fails
bool need_micro_tuning = !aicpu && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty();
const int max_enter_poly_times = global_attrs.GetIntAttr(kMaxNumRetryPoly, need_micro_tuning ? 4 : 1);
int enter_count = 0;
Stmt stmt_before_poly = stmt;
while (enter_count < max_enter_poly_times) {
if (!aicpu && polyhedral) {
Array<NodeRef> poly_res = NEXT_PASS(AutoPoly, stmt_before_poly, binds_0, global_attrs, false, is_dynamic);
enter_count++;
CHECK_EQ(poly_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(poly_res[0]);
Array<ktvm::Var> tiling_params = ktvm::Downcast<Array<ktvm::Var>>(poly_res[1]);
for (const auto &var : tiling_params) {
arg_list_0.push_back(var);
}
if (is_dynamic) {
if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) {
stmt = NEXT_PASS(SubstituteDivVar, stmt);
if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) {
Array<NodeRef> arg_list_2;
Map<Tensor, Buffer> binds_2;
FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2);
stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2);
arg_list_0 = arg_list_2;
binds_0 = binds_2;
}
// fix var addresses because poly identifies vars by name
stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0);
// isolate dynamic tile loops (isolate body and tail)
if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) {
stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false));
stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0);
if (is_dynamic) {
if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) {
stmt = NEXT_PASS(SubstituteDivVar, stmt);
}
// fix var addresses because poly identifies vars by name
stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0);
// isolate dynamic tile loops (isolate body and tail)
if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) {
stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false));
stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0);
}
}
}
// please do not insert passes between AutoPoly and the cube special passes.
// cube special pass begin
stmt = NEXT_PASS(ExprPatternRewrite, stmt);
stmt = NEXT_PASS(AutoMadPragmaAttr, stmt);
stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic);
stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0);
stmt = NEXT_PASS(PostProcessImg2col, stmt);
stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic);
stmt = NEXT_PASS(BypassL1, stmt);
if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) {
stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic);
}
stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic);
// cube special pass end
stmt = NEXT_PASS(CopyPropagation, stmt, binds_0);
stmt = NEXT_PASS(ConvertCondToExtent, stmt);
bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false);
if (enable_convert_if) {
stmt = NEXT_PASS(FixRealizeShape, stmt);
}
if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) {
stmt = NEXT_PASS(DMASink, stmt);
}
// please do not insert passes between AutoPoly and the cube special passes.
// cube special pass begin
stmt = NEXT_PASS(ExprPatternRewrite, stmt);
stmt = NEXT_PASS(AutoMadPragmaAttr, stmt);
stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic);
stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0);
stmt = NEXT_PASS(PostProcessImg2col, stmt);
stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic);
stmt = NEXT_PASS(BypassL1, stmt);
if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) {
stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic);
}
stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic);
// cube special pass end
stmt = NEXT_PASS(CopyPropagation, stmt, binds_0);
stmt = NEXT_PASS(ConvertCondToExtent, stmt);
bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false);
if (enable_convert_if) {
stmt = NEXT_PASS(FixRealizeShape, stmt);
}
if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) {
stmt = NEXT_PASS(DMASink, stmt);
}
stmt = NEXT_PASS(LowerWith, stmt);
stmt = NEXT_PASS(ForEliminate, stmt);
stmt = NEXT_PASS(RealizeCompress, stmt);
stmt = NEXT_PASS(LowerWith, stmt);
stmt = NEXT_PASS(ForEliminate, stmt);
stmt = NEXT_PASS(RealizeCompress, stmt);
if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
stmt = NEXT_PASS(LoopNormlize, stmt);
if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
stmt = NEXT_PASS(LoopNormlize, stmt);
}
stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic);
stmt = NEXT_PASS(InjectAttr, stmt);
stmt = NEXT_PASS(ModDivEliminate, stmt);
if (enable_convert_if) {
stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0);
stmt = NEXT_PASS(FixLoopExtent, stmt);
stmt = NEXT_PASS(ConvertIfToSelect, stmt);
}
}
stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic);
stmt = NEXT_PASS(InjectAttr, stmt);
stmt = NEXT_PASS(ModDivEliminate, stmt);
if (enable_convert_if) {
stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0);
stmt = NEXT_PASS(FixLoopExtent, stmt);
stmt = NEXT_PASS(ConvertIfToSelect, stmt);
try {
stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64);
} catch (const std::runtime_error &e) {
if (enter_count >= max_enter_poly_times) {
CHECK(false) << e.what();
}
global_attrs.Set(kErrorInfo, StringImm::make(e.what()));
continue;
}
stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false));
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(UnifyAllocate, stmt);
}
}
stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64);
stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false));
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(UnifyAllocate, stmt);
}
if (global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) {
stmt = NEXT_PASS(PreProcess4Multicore, stmt);
}
if (global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) {
stmt = NEXT_PASS(PreProcess4Multicore, stmt);
}
int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1);
if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) {
stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt);
}
stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false));
int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1);
if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) {
stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt);
}
stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false));
// Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning,
// 4 : whether to partition convolution or not
if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral);
}
// Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning,
// 4 : whether to partition convolution or not
if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral);
}
if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) {
stmt = NEXT_PASS(SinkAllocate, stmt);
}
if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) {
stmt = NEXT_PASS(SinkAllocate, stmt);
}
if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) {
// For manual scheduling, or when polyhedral is not used
stmt = NEXT_PASS(UnrollNonConstantExtent, stmt);
}
if (!polyhedral) {
// fix mad attributes and remove dead computations for the manual schedule
stmt = NEXT_PASS(FixMadAttrs, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(CanonicalSimplify, stmt);
}
stmt = NEXT_PASS(ForEliminate, stmt);
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(FixLoopExtent, stmt);
}
if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) {
// For manual scheduling, or when polyhedral is not used
stmt = NEXT_PASS(UnrollNonConstantExtent, stmt);
}
if (!polyhedral) {
// fix mad attributes and remove dead computations for the manual schedule
stmt = NEXT_PASS(FixMadAttrs, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(CanonicalSimplify, stmt);
}
stmt = NEXT_PASS(ForEliminate, stmt);
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(FixLoopExtent, stmt);
}
if (!aicpu) {
stmt = NEXT_PASS(AutoPragma, stmt);
}
stmt = NEXT_PASS(EliminateAtomicDma, stmt);
if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) {
stmt = NEXT_PASS(DeadCodeElim, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(RewriteBroadcastVector, stmt);
stmt = NEXT_PASS(OptimizePragma, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true),
global_attrs.GetIntAttr(kEnableScalarAlign, false));
} else {
stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt);
}
stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic);
stmt = NEXT_PASS(AutoReorder, stmt);
if (enable_multicore != 0) {
if (is_dynamic && enable_multicore == 1) {
Var block_dim = Variable::make(Int(32), "blockDim");
Array<NodeRef> multicore_res =
NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0));
CHECK_EQ(multicore_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(multicore_res[0]);
auto extent_thread = ktvm::Downcast<Integer>(multicore_res[1]);
if (extent_thread.as<IntImm>()->value == -1) {
arg_list_0.push_back(block_dim);
}
if (!aicpu) {
stmt = NEXT_PASS(AutoPragma, stmt);
}
stmt = NEXT_PASS(EliminateAtomicDma, stmt);
if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) {
stmt = NEXT_PASS(DeadCodeElim, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(RewriteBroadcastVector, stmt);
stmt = NEXT_PASS(OptimizePragma, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true),
global_attrs.GetIntAttr(kEnableScalarAlign, false));
} else {
int block_dim = enable_multicore == 1 ? -1 : enable_multicore;
stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic,
global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false));
stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt);
}
stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic);
stmt = NEXT_PASS(AutoReorder, stmt);
if (enable_multicore != 0) {
if (is_dynamic && enable_multicore == 1) {
Var block_dim = Variable::make(Int(32), "blockDim");
Array<NodeRef> multicore_res =
NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0));
CHECK_EQ(multicore_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(multicore_res[0]);
auto extent_thread = ktvm::Downcast<Integer>(multicore_res[1]);
if (extent_thread.as<IntImm>()->value == -1) {
arg_list_0.push_back(block_dim);
}
} else {
int block_dim = enable_multicore == 1 ? -1 : enable_multicore;
stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic,
global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false));
}
}
if (!is_dynamic) {
RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, false));
}
stmt = NEXT_PASS(SelectLower, stmt);
stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
stmt = NEXT_PASS(GatherLoopInfo, stmt);
}
stmt = NEXT_PASS(CastFilter, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(SplitTail, stmt);
}
stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true),
global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic);
// must be after EmitInsn
stmt = NEXT_PASS(TileCoverCorrect, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
// simulated blocks > 2 400 000 => simulated case takes too much time (> 100 sec)
// number of protections > 512 => too many brackets in the if statement throw an error
stmt = NEXT_PASS(CoverProtection, stmt, 2400000, 512);
}
stmt = NEXT_PASS(ConvertDivModToShift, stmt);
if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
// for conv manual schedule and load3d
stmt = NEXT_PASS(CoarsenImg2Col, stmt);
}
stmt = NEXT_PASS(DTypeAdapter, stmt);
if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) {
stmt = NEXT_PASS(HoistInsn, stmt);
}
// temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope
if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) {
stmt = NEXT_PASS(InvariantHoist, stmt);
}
stmt = NEXT_PASS(SetVectorMaskDefault, stmt);
stmt = NEXT_PASS(ElimVectorMask, stmt);
stmt = NEXT_PASS(ElimDMA, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(MultiCorePartition, stmt);
}
}
if (!is_dynamic) {
RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, false));
}
stmt = NEXT_PASS(SelectLower, stmt);
stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
stmt = NEXT_PASS(GatherLoopInfo, stmt);
}
stmt = NEXT_PASS(CastFilter, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(SplitTail, stmt);
}
stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true),
global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic);
// must be after EmitInsn
stmt = NEXT_PASS(TileCoverCorrect, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
// simulated blocks > 2 400 000 => simulated case takes too much time (> 100 sec)
// number of protections > 512 => too many brackets in the if statement throw an error
stmt = NEXT_PASS(CoverProtection, stmt, 2400000, 512);
}
stmt = NEXT_PASS(ConvertDivModToShift, stmt);
if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
// for conv manual schedule and load3d
stmt = NEXT_PASS(CoarsenImg2Col, stmt);
}
stmt = NEXT_PASS(DTypeAdapter, stmt);
if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) {
stmt = NEXT_PASS(HoistInsn, stmt);
}
// temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope
if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) {
stmt = NEXT_PASS(InvariantHoist, stmt);
}
stmt = NEXT_PASS(SetVectorMaskDefault, stmt);
stmt = NEXT_PASS(ElimVectorMask, stmt);
stmt = NEXT_PASS(ElimDMA, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(MultiCorePartition, stmt);
}
if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) {
stmt = NEXT_PASS(AutoDoubleBuffer, stmt);
}
stmt = NEXT_PASS(InjectAccessPtrMSG, stmt);
if (!aicpu) {
stmt = NEXT_PASS(InjectPipe, stmt);
}
stmt = NEXT_PASS(ModDivEliminate, stmt);
if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) {
stmt = NEXT_PASS(AutoDoubleBuffer, stmt);
}
stmt = NEXT_PASS(InjectAccessPtrMSG, stmt);
if (!aicpu) {
stmt = NEXT_PASS(InjectPipe, stmt);
}
stmt = NEXT_PASS(ModDivEliminate, stmt);
// Phase 2
if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral);
}
if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
// Phase 2
if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral);
}
if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string());
// attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts
bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true);
// run MaxSAT solver for bank conflicts with no limits on model size or runtime
bool bc_no_limits = false;
// timeout for MaxSAT solver in seconds (int)
int maxsat_timeout = 4;
try {
stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout);
} catch (MemoryAllocationException &e) {
if (enter_count >= max_enter_poly_times) {
CHECK(false) << e.what();
}
global_attrs.Set(kAllocBits, ktvm::make_const(Int(32), e.alloc_bits_ + e.need_bits_));
global_attrs.Set(kErrorScope, StringImm::make(e.scope_));
continue;
}
break;
}
std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string());
// attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts
bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true);
// run MaxSAT solver for bank conflicts with no limits on model size or runtime
bool bc_no_limits = false;
// timeout for MaxSAT solver in seconds (int)
int maxsat_timeout = 4;
stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout);
if (!is_dynamic)
stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth,
......
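The hunk above wraps the lowering pipeline in a retry loop. Stripped of the individual passes, the control flow is simple: run the pipeline; on an allocation failure, record the failing scope and the requested size as hints for the next tiling attempt; fail hard only on the last try. Below is a minimal, self-contained sketch of that pattern. `TuningHints`, `AllocError`, and `run_passes` are hypothetical stand-ins, not AKG APIs.

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical retry hints; the real code stores these in global_attrs
// under kErrorScope / kAllocBits.
struct TuningHints {
  std::string error_scope;
  uint64_t alloc_bits = 0;
};

// Hypothetical failure type standing in for MemoryAllocationException.
struct AllocError : std::runtime_error {
  AllocError(const std::string &scope, uint64_t bits)
      : std::runtime_error("alloc exceeded in " + scope), scope(scope), bits(bits) {}
  std::string scope;
  uint64_t bits;
};

// Pretend pass pipeline: fails until hints are available to shrink the tiles.
void run_passes(const TuningHints &hints) {
  if (hints.error_scope.empty()) throw AllocError("local.UB", 5242880);
}

int main() {
  const int max_enter_poly_times = 4;  // 1 initial run + up to 3 retries, as in Lower
  TuningHints hints;
  for (int enter_count = 1; enter_count <= max_enter_poly_times; ++enter_count) {
    try {
      run_passes(hints);
      std::cout << "lowering succeeded on attempt " << enter_count << "\n";
      break;  // success: leave the loop, mirroring the break after StorageRewriteCCE
    } catch (const AllocError &e) {
      if (enter_count >= max_enter_poly_times) throw;  // last attempt: fail hard, like CHECK(false)
      hints.error_scope = e.scope;  // recorded so the tiling solver lowers its budget
      hints.alloc_bits = e.bits;
    }
  }
  return 0;
}
```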
......@@ -98,7 +98,13 @@ int AttrMap::GetIntAttr(const std::string &attr_name, int dft_value) {
const NodeRef &e = this->at(attr_name);
return ir::GetInt32Const(Downcast<Expr>(e));
}
double AttrMap::GetFloatAttr(const std::string &attr_name, double dft_value) {
if (this->count(attr_name) == 0) {
return dft_value;
}
const NodeRef &e = this->at(attr_name);
return ir::GetFloatConst(Downcast<Expr>(e));
}
bool AttrMap::GetBoolAttr(const std::string &attr_name, bool dft_value) {
int result = GetIntAttr(attr_name, static_cast<int>(dft_value));
CHECK(result == 0 || result == 1) << "Bool attribute " << attr_name << " must be 0 or 1, but found "
......
......@@ -91,6 +91,11 @@ constexpr auto kEnableRemoveBroadcastCopy = "enable_remove_broadcast_copy";
constexpr auto kEnableSubstituteDivVar = "enable_divide_var";
constexpr auto kEnableComputeInPlace = "enable_compute_in_place";
constexpr auto kEnableRewriteScalarCompute = "enable_rewrite_scalar_compute";
constexpr auto kMaxNumRetryPoly = "max_num_retry_poly";
constexpr auto kUBRatio = "ub_ratio";
constexpr auto kErrorInfo = "error_info";
constexpr auto kErrorScope = "error_scope";
constexpr auto kAllocBits = "alloc_bits";
static std::unordered_map<std::string, int> help_tiling_level = {
{"None", 0},
......@@ -109,7 +114,7 @@ class AttrMap : public Map<std::string, NodeRef> {
bool GetBoolAttr(const std::string &attr_name, bool dft_value);
int GetIntAttr(const std::string &attr_name, int dft_value);
double GetFloatAttr(const std::string &attr_name, double dft_value);
bool GetStringAttr(const std::string &attr_name, std::string *attr_to_set);
std::string GetStringAttr(const std::string &attr_name, const std::string &dft_value);
};
......
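The header change above replaces the out-parameter overload `bool GetStringAttr(name, std::string*)` with a default-value overload, matching `GetIntAttr` and the new `GetFloatAttr`. A minimal sketch of that lookup-or-default pattern over plain maps (the real `AttrMap` wraps a TVM `Map<std::string, NodeRef>`, so this is an analogy rather than the actual class):

```cpp
#include <iostream>
#include <map>
#include <string>

// Simplified analogue of AttrMap's default-value accessors.
class Attrs {
 public:
  void SetFloat(const std::string &k, double v) { floats_[k] = v; }
  double GetFloatAttr(const std::string &k, double dft) const {
    auto it = floats_.find(k);
    return it == floats_.end() ? dft : it->second;
  }
  void SetString(const std::string &k, const std::string &v) { strings_[k] = v; }
  // New-style GetStringAttr: returns the value (or a default) instead of
  // filling an out-parameter as the old bool overload did.
  std::string GetStringAttr(const std::string &k, const std::string &dft) const {
    auto it = strings_.find(k);
    return it == strings_.end() ? dft : it->second;
  }
 private:
  std::map<std::string, double> floats_;
  std::map<std::string, std::string> strings_;
};

int main() {
  Attrs attrs;
  std::cout << attrs.GetFloatAttr("ub_ratio", 1.0) << "\n";  // 1: no ratio recorded yet
  attrs.SetFloat("ub_ratio", 0.4);                           // set after a failed rewrite
  std::cout << attrs.GetFloatAttr("ub_ratio", 1.0) << "\n";  // 0.4 on the retry
  std::cout << attrs.GetStringAttr("error_scope", "<none>") << "\n";
  return 0;
}
```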
......@@ -18,11 +18,33 @@
#define INCLUDE_AKG_BUILD_MODULE_H_
#include <string>
#include <exception>
#include "codegen/util.h"
namespace akg {
extern AttrMap global_attrs;
/*
 * Custom exception thrown when memory allocation fails; it triggers micro-tuning to try to recover from the failure.
 */
class MemoryAllocationException : public std::exception {
public:
  MemoryAllocationException(const std::string &scope, uint64_t need_bits, uint64_t alloc_bits)
      : scope_(scope), need_bits_(need_bits), alloc_bits_(alloc_bits),
        msg_("Allocation exceed bound of memory tag " + scope + ": need " + std::to_string(need_bits) +
             " bits, total alloc " + std::to_string(alloc_bits) + " bits.") {}
  // Keep the message in a member string: returning what() from a temporary
  // std::runtime_error would leave a dangling pointer.
  const char *what() const noexcept override { return msg_.c_str(); }
  std::string scope_{""};
  uint64_t need_bits_{0};
  uint64_t alloc_bits_{0};

 private:
  std::string msg_;
};
NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars, const std::string &name,
const Map<Tensor, Buffer> &in_binds, const Map<std::string, NodeRef> &in_attrs, bool simple_mode,
bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config);
......
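As a quick sanity check of the exception added above, the snippet below throws and catches it the way `Lower` does around `StorageRewriteCCE`. The class is re-declared in condensed form so the example builds standalone; as in the fixed version above, the message lives in a member string so the pointer returned by `what()` stays valid after the call.

```cpp
#include <cstdint>
#include <exception>
#include <iostream>
#include <string>

// Condensed copy of the exception so the example builds standalone.
class MemoryAllocationException : public std::exception {
 public:
  MemoryAllocationException(const std::string &scope, uint64_t need_bits, uint64_t alloc_bits)
      : scope_(scope), need_bits_(need_bits), alloc_bits_(alloc_bits),
        msg_("Allocation exceed bound of memory tag " + scope + ": need " + std::to_string(need_bits) +
             " bits, total alloc " + std::to_string(alloc_bits) + " bits.") {}
  const char *what() const noexcept override { return msg_.c_str(); }
  std::string scope_;
  uint64_t need_bits_;
  uint64_t alloc_bits_;
 private:
  std::string msg_;
};

int main() {
  try {
    throw MemoryAllocationException("local.UB", 5242880, 2097152);
  } catch (const MemoryAllocationException &e) {
    // Lower's catch records e.scope_ (kErrorScope) and e.alloc_bits_ + e.need_bits_
    // (kAllocBits) as hints before retrying AutoPoly.
    std::cout << e.what() << std::endl;
  }
  return 0;
}
```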
......@@ -26,6 +26,7 @@
#include <regex>
#include "ir_pass.h"
#include "build_module.h"
#include "pass/ir_util.h"
#include "emit_insn/insn_info.h"
#include "pass/storage_rewrite_cce.h"
......@@ -1146,8 +1147,7 @@ bool StoragePlanRewriterCCE::DoRewrite(const std::string scope, std::vector<std:
}
if (spec_level <= 0 || child_idx < 0) {
if (!is_dynamic_) {
LOG(FATAL) << "Allocation exceed bound of memory tag " << scope << ": need " << need_nbits
<< " bits, total alloc " << total_alloc_bits << " bits";
throw MemoryAllocationException(scope, need_nbits, total_alloc_bits);
} else {
LOG(WARNING) << "Dynamic shape static allocation exceed bound of memory tag " << scope << ": need "
<< need_nbits << " bits, will use dynamic allocation instead";
......
......@@ -16,11 +16,63 @@
*/
#include "poly/tiling_solver.h"
#include "build_module.h"
namespace akg {
namespace ir {
namespace poly {
/*
* This function parses the StorageFlatten error info into a ratio that guides auto tiling to reduce
* memory allocation.
* e.g.
* error info : Check failed: const_size * op->type.bits() <= info->max_num_bits (5242880 vs. 2097152) :
* Allocation exceed bound of memory tag local.UB.
* ratio : memory_size / alloc_size = (2097152 / 5242880) = 0.4, which means the total allocation
* size used in auto tiling should be reduced to 0.4 of its previous value.
*/
double TilingSolver::GetNewAllocRatioWhenFlattenFail(const std::string &error_info) {
std::vector<std::string> sub_strs;
sub_strs = akg::common::Split(error_info, "(");
CHECK_GE(sub_strs.size(), 3U);  // sub_strs[2] is accessed below
std::string tmp_str = sub_strs[2];
sub_strs = akg::common::Split(tmp_str, " ");
CHECK(!sub_strs.empty());
auto alloc_bits = static_cast<double>(std::strtod(sub_strs[0].c_str(), nullptr));
sub_strs = akg::common::Split(error_info, ")");
CHECK_GE(sub_strs.size(), 2U);  // sub_strs[1] is accessed below
tmp_str = sub_strs[1];
sub_strs = akg::common::Split(tmp_str, " ");
CHECK(!sub_strs.empty());
auto memory_bits = static_cast<double>(std::strtod(sub_strs.back().c_str(), nullptr));
CHECK_NE(alloc_bits, 0);
return memory_bits / alloc_bits;
}
/*
* This function returns an adjustment ratio that further reduces the memory allocation limit, beyond the
* default percentage reserved for auto double buffer, and tries to generate smaller tile sizes that help
* recover from memory allocation failures such as the one in the storage rewrite CCE pass.
*/
double TilingSolver::GetNewAllocRatioWhenRewriteFail(int64_t memory_bits) {
auto actual_allocs = global_attrs.GetFloatAttr(kAllocBits, 0.0);
auto last_adjust_ratio = global_attrs.GetFloatAttr(kUBRatio, 1.0);
auto adjust_ratio = 1.0;
if (actual_allocs != 0) {
std::stringstream ss;
auto expect_allocs = memory_bits * last_adjust_ratio;
adjust_ratio = (expect_allocs / actual_allocs);
ss << "Adjust memory allocation ratio to " << adjust_ratio << " times and retry tiling.";
global_attrs.Set(kUBRatio, ktvm::make_const(Float(32), adjust_ratio));
analyzer_.logger_.AppendLog(MICRO_TUNING, ss);
}
return adjust_ratio;
}
void TilingSolver::CollectMemoryLimit() {
// Init memory allocation percentage.
percentage_ = ALLOCATION_PERCENTAGE;
for (auto attr : analyzer_.RootAxis()->attrs) {
if (attr.attr_key != "MEM_RATIO") continue;
......@@ -29,9 +81,27 @@ void TilingSolver::CollectMemoryLimit() {
break;
}
// Handle previous error info if storage flatten fails and adjust allocation percentage.
auto error_info = global_attrs.GetStringAttr(kErrorInfo, "");
if (!error_info.empty() && error_info.find("storage_flatten") != std::string::npos) {
std::stringstream ss;
ss << "Get Error Info! -> " << global_attrs.GetStringAttr(kErrorInfo, "");
percentage_ = percentage_ * GetNewAllocRatioWhenFlattenFail(error_info);
ss << "Adjust memory allocation to " << percentage_ << " of memory size and retry tiling.";
global_attrs.Set(kErrorInfo, StringImm::make(""));
analyzer_.logger_.AppendLog(MICRO_TUNING, ss);
}
// Init the memory limit for each scope; reduce the local.UB ratio if storage rewrite failed previously.
DavinciInfo &d_info = DavinciInfo::GetInstance();
auto error_scope = global_attrs.GetStringAttr(kErrorScope, "");
for (auto i = 0; i < MEM_SCOPE_BULK; ++i) {
this->mem_limit_[i] = d_info.GetMemoryLimitInScope(i) * percentage_;
if (i == DavinciMemScope::MEM_SCOPE_UB && error_scope == "local.UB") {
this->mem_limit_[i] =
std::max(static_cast<int>(this->mem_limit_[i] * GetNewAllocRatioWhenRewriteFail(this->mem_limit_[i])), 1);
global_attrs.Set(kErrorScope, StringImm::make(""));
}
}
}
......
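To make the two recovery ratios concrete: for the sample StorageFlatten message in the comment above, the parsed values are alloc = 5242880 bits and limit = 2097152 bits, so the solver retries with 2097152 / 5242880 = 0.4 of the previous budget. The sketch below reproduces both calculations with plain string handling; `Split` is a stand-in for `akg::common::Split`, and the rewrite-fail numbers are illustrative.

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Minimal stand-in for akg::common::Split (splits on a single character).
static std::vector<std::string> Split(const std::string &s, char delim) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string piece;
  while (std::getline(ss, piece, delim)) out.push_back(piece);
  return out;
}

// Mirrors GetNewAllocRatioWhenFlattenFail on the example message.
double RatioFromFlattenError(const std::string &error_info) {
  auto by_open = Split(error_info, '(');
  // by_open[2] begins with the actual allocation: "5242880 vs. 2097152) : ..."
  double alloc_bits = std::stod(Split(by_open[2], ' ')[0]);  // 5242880
  auto by_close = Split(error_info, ')');
  auto words = Split(by_close[1], ' ');
  double memory_bits = std::stod(words.back());              // 2097152
  return memory_bits / alloc_bits;                           // 0.4
}

// Mirrors GetNewAllocRatioWhenRewriteFail: shrink the budget so the expected
// allocation under the previous ratio fits what was actually requested.
double RatioFromRewriteFail(double memory_bits, double actual_allocs, double last_ratio) {
  return memory_bits * last_ratio / actual_allocs;
}

int main() {
  const std::string msg =
      "Check failed: const_size * op->type.bits() <= info->max_num_bits "
      "(5242880 vs. 2097152) : Allocation exceed bound of memory tag local.UB.";
  std::cout << RatioFromFlattenError(msg) << "\n";                   // prints 0.4
  std::cout << RatioFromRewriteFail(2097152, 5242880, 1.0) << "\n";  // illustrative: also 0.4
  return 0;
}
```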
......@@ -30,6 +30,8 @@ class TilingSolver {
~TilingSolver() {}
void CollectMemoryLimit();
void CollectTileAxisTopDown();
double GetNewAllocRatioWhenFlattenFail(const std::string &error_info);
double GetNewAllocRatioWhenRewriteFail(int64_t memory_bits);
TileCandidate *Solve();
TilingAnalyzer &analyzer_;
......
......@@ -29,6 +29,8 @@ void TileLogger::AppendLine(LogStage stage, const std::string &line) {
analyze_tiling_space_stage_.emplace_back(line);
} else if (stage == DO_TILING) {
do_tiling_stage_.emplace_back(line);
} else if (stage == MICRO_TUNING) {
micro_tuning_stage_.emplace_back(line);
} else {
do_tuning_stage_.emplace_back(line);
}
......@@ -70,6 +72,11 @@ bool TileLogger::DumpLogFile() {
of << line << std::endl;
}
of << "=========================" << std::endl;
of << ">>>>>>>>>> Micro tuning stage <<<<<<<<<<<<" << std::endl;
for (const auto &line : micro_tuning_stage_) {
of << line << std::endl;
}
of << "=========================" << std::endl;
of.close();
return true;
}
......
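The logger change is mechanical: each `LogStage` owns a line buffer, `AppendLine` routes by stage, and `DumpLogFile` emits one titled section per buffer. A condensed sketch of that routing with only two stages (buffer handling simplified from the real `TileLogger`):

```cpp
#include <iostream>
#include <string>
#include <vector>

enum LogStage { DO_TILING, MICRO_TUNING };  // condensed from the full enum

class TileLogger {
 public:
  void AppendLine(LogStage stage, const std::string &line) {
    // Route each line to its stage buffer, as AppendLine does for MICRO_TUNING.
    (stage == MICRO_TUNING ? micro_tuning_stage_ : do_tiling_stage_).push_back(line);
  }
  void Dump() const {
    std::cout << ">>>>>>>>>> Micro tuning stage <<<<<<<<<<<<\n";
    for (const auto &line : micro_tuning_stage_) std::cout << line << "\n";
  }
 private:
  std::vector<std::string> do_tiling_stage_;
  std::vector<std::string> micro_tuning_stage_;
};

int main() {
  TileLogger logger;
  logger.AppendLine(MICRO_TUNING, "Adjust memory allocation ratio to 0.4 times and retry tiling.");
  logger.Dump();
  return 0;
}
```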
......@@ -32,7 +32,7 @@ enum DavinciMemScope {
MEM_SCOPE_L0C,
MEM_SCOPE_BULK,
};
enum LogStage { ANA_SCHETREE, ANA_BUF_LIVE_EXTENT, ANA_TILING_SPACE, DO_TILING, DO_TUNING };
enum LogStage { ANA_SCHETREE, ANA_BUF_LIVE_EXTENT, ANA_TILING_SPACE, DO_TILING, DO_TUNING, MICRO_TUNING };
class DavinciInfo {
public:
......@@ -89,6 +89,7 @@ class TileLogger {
LogFile analyze_tiling_space_stage_;
LogFile do_tiling_stage_;
LogFile do_tuning_stage_;
LogFile micro_tuning_stage_;
};
} // namespace poly
} // namespace ir
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""unittest for micro-tuning"""
from akg.utils import kernel_exec
from akg.ops.array import four2five
def test_four2five_without_custom_tiling(build_shape, dtype, op_attrs):
"""This test case will fail without cunstom tiling and micro-tuning will automatically adjust tile sizes."""
build_attr = op_attrs + [False]
return kernel_exec.op_build_test(four2five.four2five, [build_shape], [dtype], build_attr, kernel_name="four2five", attrs={}, tuning=False)
if __name__ == "__main__":
test_four2five_without_custom_tiling(
[32, 1001, 1, 1], "float16", ['NCHW', 'float16'])
......@@ -22,6 +22,7 @@ casefiles=(
"pass/test_promote_if.py"
"pass/test_sink_if.py"
"pass/test_ir_parser.py"
"pass/test_micro_tuning.py"
"pass/test_elim_vector_mask.py"
"pass/test_copy_propagation.py"
"pass/test_utils_detect_non_linear_index.py"
......