From aaf7702176bd4fd0bcc76ef1a56bfd5c4288fab6 Mon Sep 17 00:00:00 2001 From: dabaiji Date: Tue, 14 Jul 2020 09:22:40 +0800 Subject: [PATCH] support dynamical memory allocation ratio adjustment in micro-tuning for allocation exceed problem --- python/akg/ops/array/four2five.py | 9 +- src/codegen/build_module.cc | 432 ++++++++++++----------- src/codegen/util.cc | 8 +- src/codegen/util.h | 7 +- src/include/build_module.h | 22 ++ src/pass/storage_rewrite_cce.cc | 4 +- src/poly/tiling_solver.cc | 72 +++- src/poly/tiling_solver.h | 2 + src/poly/tiling_utils.cc | 7 + src/poly/tiling_utils.h | 3 +- tests/unittest/pass/test_micro_tuning.py | 28 ++ tests/unittest/unittest.sh | 1 + 12 files changed, 383 insertions(+), 212 deletions(-) create mode 100644 tests/unittest/pass/test_micro_tuning.py diff --git a/python/akg/ops/array/four2five.py b/python/akg/ops/array/four2five.py index 7cb344f..fc643a6 100644 --- a/python/akg/ops/array/four2five.py +++ b/python/akg/ops/array/four2five.py @@ -116,8 +116,8 @@ def four2five_tiling_strategy_dynamic(tensor, input_format): strategy.append(ct_util.create_constraint_on_tensor(tensor, 16, ct_util.TileConstraint.FACTOR, 4)[0]) return strategy -@vc_util.check_input_type(akg.tvm.tensor.Tensor, str, str) -def four2five(data, format_, dst_dtype='float16'): +@vc_util.check_input_type(akg.tvm.tensor.Tensor, str, str, bool) +def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True): """ Convert 4-dims "data" to 5-dims,the format of "data" is defined in "format_" @@ -294,8 +294,9 @@ def four2five(data, format_, dst_dtype='float16'): dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype) if dim_info != "": attrs["dim"] = dim_info - attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion) - else: + if need_custom_tiling: + attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion) + elif need_custom_tiling: attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(output, format_) if 
is_dynamic: diff --git a/src/codegen/build_module.cc b/src/codegen/build_module.cc index bd98bf1..3b8e82c 100644 --- a/src/codegen/build_module.cc +++ b/src/codegen/build_module.cc @@ -458,7 +458,7 @@ NodeRef Lower(Schedule sch, const Array &in_args, const Array PassTimer *pass_timer = PassTimer::GetInstance(); global_attrs.Set(kKernelName, StringImm::make(name)); - global_attrs.Set(kDumpPassIr, ktvm::make_const(Int(1), config->dump_pass_ir)); + global_attrs.Set(kDumpPassIr, ktvm::make_const(Int(32), config->dump_pass_ir)); if (config->dump_pass_ir) { std::string dump_ir_dir; if (global_attrs.GetStringAttr(kDumpIrDir, &dump_ir_dir)) { @@ -498,7 +498,7 @@ NodeRef Lower(Schedule sch, const Array &in_args, const Array stmt = NEXT_PASS(RenameRealize, stmt, binds_0, replace); bool is_dynamic = !shape_vars.empty(); - global_attrs.Set(kIsDynamic, ktvm::make_const(Int(1), is_dynamic)); + global_attrs.Set(kIsDynamic, ktvm::make_const(Int(32), is_dynamic)); Array arg_list_1; Map binds_1; @@ -594,227 +594,255 @@ NodeRef Lower(Schedule sch, const Array &in_args, const Array NodeRef tuning_spaces = NEXT_PASS(GenTuningSpace, stmt, binds_0, attrs_1, false); return tuning_spaces; } - Array poly_res = NEXT_PASS(AutoPoly, stmt, binds_0, global_attrs, false, is_dynamic); - CHECK_EQ(poly_res.size(), 2); - stmt = ktvm::Downcast(poly_res[0]); - Array tiling_params = ktvm::Downcast>(poly_res[1]); - for (const auto &var : tiling_params) { - arg_list_0.push_back(var); - } + } - if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) { - Array arg_list_2; - Map binds_2; - FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2); - stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2); - arg_list_0 = arg_list_2; - binds_0 = binds_2; - } + // micro-tuning configs: current strategy is to retry autopoly up to 3 times when storage flatten/rewrite fails + bool need_micro_tuning = !aicpu && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty(); + const int 
max_enter_poly_times = global_attrs.GetIntAttr(kMaxNumRetryPoly, need_micro_tuning ? 4 : 1); + int enter_count = 0; + Stmt stmt_before_poly = stmt; + while (enter_count < max_enter_poly_times) { + if (!aicpu && polyhedral) { + Array poly_res = NEXT_PASS(AutoPoly, stmt_before_poly, binds_0, global_attrs, false, is_dynamic); + enter_count++; + CHECK_EQ(poly_res.size(), 2); + stmt = ktvm::Downcast(poly_res[0]); + Array tiling_params = ktvm::Downcast>(poly_res[1]); + for (const auto &var : tiling_params) { + arg_list_0.push_back(var); + } - if (is_dynamic) { - if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) { - stmt = NEXT_PASS(SubstituteDivVar, stmt); + if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) { + Array arg_list_2; + Map binds_2; + FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2); + stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2); + arg_list_0 = arg_list_2; + binds_0 = binds_2; } - // fix var addresses because poly identify vars by name - stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0); - // isolate dynamic tile loops (isolate body and tail) - if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) { - stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false)); - stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0); + if (is_dynamic) { + if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) { + stmt = NEXT_PASS(SubstituteDivVar, stmt); + } + + // fix var addresses because poly identify vars by name + stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0); + // isolate dynamic tile loops (isolate body and tail) + if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) { + stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false)); + stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0); + } } - } - // pls do not insert pass between AutoPoly and cube special pass. 
- // cube special pass begin - stmt = NEXT_PASS(ExprPatternRewrite, stmt); - stmt = NEXT_PASS(AutoMadPragmaAttr, stmt); - stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic); - stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0); - stmt = NEXT_PASS(PostProcessImg2col, stmt); - stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic); - stmt = NEXT_PASS(BypassL1, stmt); - if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) { - stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic); - } - stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic); - // cube special pass end - stmt = NEXT_PASS(CopyPropagation, stmt, binds_0); - stmt = NEXT_PASS(ConvertCondToExtent, stmt); - bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false); - if (enable_convert_if) { - stmt = NEXT_PASS(FixRealizeShape, stmt); - } - if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) { - stmt = NEXT_PASS(DMASink, stmt); - } + // pls do not insert pass between AutoPoly and cube special pass. + // cube special pass begin + stmt = NEXT_PASS(ExprPatternRewrite, stmt); + stmt = NEXT_PASS(AutoMadPragmaAttr, stmt); + stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic); + stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0); + stmt = NEXT_PASS(PostProcessImg2col, stmt); + stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic); + stmt = NEXT_PASS(BypassL1, stmt); + if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) { + stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic); + } + stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic); + // cube special pass end + stmt = NEXT_PASS(CopyPropagation, stmt, binds_0); + stmt = NEXT_PASS(ConvertCondToExtent, stmt); + bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false); + if (enable_convert_if) { + stmt = NEXT_PASS(FixRealizeShape, stmt); + } + if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) { + stmt = NEXT_PASS(DMASink, stmt); + } - stmt = NEXT_PASS(LowerWith, stmt); - stmt = NEXT_PASS(ForEliminate, stmt); - stmt = 
NEXT_PASS(RealizeCompress, stmt); + stmt = NEXT_PASS(LowerWith, stmt); + stmt = NEXT_PASS(ForEliminate, stmt); + stmt = NEXT_PASS(RealizeCompress, stmt); - if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) { - stmt = NEXT_PASS(LoopNormlize, stmt); + if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) { + stmt = NEXT_PASS(LoopNormlize, stmt); + } + stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic); + stmt = NEXT_PASS(InjectAttr, stmt); + stmt = NEXT_PASS(ModDivEliminate, stmt); + if (enable_convert_if) { + stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0); + stmt = NEXT_PASS(FixLoopExtent, stmt); + stmt = NEXT_PASS(ConvertIfToSelect, stmt); + } } - stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic); - stmt = NEXT_PASS(InjectAttr, stmt); - stmt = NEXT_PASS(ModDivEliminate, stmt); - if (enable_convert_if) { - stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0); - stmt = NEXT_PASS(FixLoopExtent, stmt); - stmt = NEXT_PASS(ConvertIfToSelect, stmt); + try { + stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64); + } catch (const std::runtime_error &e) { + if (enter_count >= max_enter_poly_times) { + CHECK(false) << e.what(); + } + global_attrs.Set(kErrorInfo, StringImm::make(e.what())); + continue; + } + stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false)); + if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) { + stmt = NEXT_PASS(AlgebraSimplify, stmt); + } + if (is_dynamic) { + stmt = NEXT_PASS(UnifyAllocate, stmt); } - } - - stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64); - stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false)); - if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) { - stmt = NEXT_PASS(AlgebraSimplify, stmt); - } - if (is_dynamic) { - stmt = NEXT_PASS(UnifyAllocate, stmt); - } - if (global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) { - stmt = NEXT_PASS(PreProcess4Multicore, stmt); - } + if 
(global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) { + stmt = NEXT_PASS(PreProcess4Multicore, stmt); + } - int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1); - if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) { - stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt); - } - stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false)); + int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1); + if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) { + stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt); + } + stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false)); - // Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning, - // 4 : whether to partition convolution or not - if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) { - stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral); - } + // Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning, + // 4 : whether to partition convolution or not + if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) { + stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral); + } - if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) { - stmt = NEXT_PASS(SinkAllocate, stmt); - } + if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) { + stmt = NEXT_PASS(SinkAllocate, stmt); + } - if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) { - // For the Manual scheduling or When polyhedral is not used - stmt = NEXT_PASS(UnrollNonConstantExtent, stmt); - } - if (!polyhedral) { - // fix mad attributes and remove dead computations for the manual schedule - stmt = NEXT_PASS(FixMadAttrs, stmt); - } - if (!is_dynamic) { - stmt = NEXT_PASS(CanonicalSimplify, stmt); - } - stmt 
= NEXT_PASS(ForEliminate, stmt); - if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) { - stmt = NEXT_PASS(AlgebraSimplify, stmt); - } - if (!is_dynamic) { - stmt = NEXT_PASS(FixLoopExtent, stmt); - } + if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) { + // For the Manual scheduling or When polyhedral is not used + stmt = NEXT_PASS(UnrollNonConstantExtent, stmt); + } + if (!polyhedral) { + // fix mad attributes and remove dead computations for the manual schedule + stmt = NEXT_PASS(FixMadAttrs, stmt); + } + if (!is_dynamic) { + stmt = NEXT_PASS(CanonicalSimplify, stmt); + } + stmt = NEXT_PASS(ForEliminate, stmt); + if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) { + stmt = NEXT_PASS(AlgebraSimplify, stmt); + } + if (!is_dynamic) { + stmt = NEXT_PASS(FixLoopExtent, stmt); + } - if (!aicpu) { - stmt = NEXT_PASS(AutoPragma, stmt); - } - stmt = NEXT_PASS(EliminateAtomicDma, stmt); - if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) { - stmt = NEXT_PASS(DeadCodeElim, stmt); - } - if (!is_dynamic) { - stmt = NEXT_PASS(RewriteBroadcastVector, stmt); - stmt = NEXT_PASS(OptimizePragma, stmt); - } - if (is_dynamic) { - stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true), - global_attrs.GetIntAttr(kEnableScalarAlign, false)); - } else { - stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt); - } - stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic); - stmt = NEXT_PASS(AutoReorder, stmt); - if (enable_multicore != 0) { - if (is_dynamic && enable_multicore == 1) { - Var block_dim = Variable::make(Int(32), "blockDim"); - Array multicore_res = - NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0)); - CHECK_EQ(multicore_res.size(), 2); - stmt = ktvm::Downcast(multicore_res[0]); - auto extent_thread = ktvm::Downcast(multicore_res[1]); - if (extent_thread.as()->value == -1) { - arg_list_0.push_back(block_dim); - } + if (!aicpu) { + stmt = 
NEXT_PASS(AutoPragma, stmt); + } + stmt = NEXT_PASS(EliminateAtomicDma, stmt); + if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) { + stmt = NEXT_PASS(DeadCodeElim, stmt); + } + if (!is_dynamic) { + stmt = NEXT_PASS(RewriteBroadcastVector, stmt); + stmt = NEXT_PASS(OptimizePragma, stmt); + } + if (is_dynamic) { + stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true), + global_attrs.GetIntAttr(kEnableScalarAlign, false)); } else { - int block_dim = enable_multicore == 1 ? -1 : enable_multicore; - stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic, - global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false)); + stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt); + } + stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic); + stmt = NEXT_PASS(AutoReorder, stmt); + if (enable_multicore != 0) { + if (is_dynamic && enable_multicore == 1) { + Var block_dim = Variable::make(Int(32), "blockDim"); + Array multicore_res = + NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0)); + CHECK_EQ(multicore_res.size(), 2); + stmt = ktvm::Downcast(multicore_res[0]); + auto extent_thread = ktvm::Downcast(multicore_res[1]); + if (extent_thread.as()->value == -1) { + arg_list_0.push_back(block_dim); + } + } else { + int block_dim = enable_multicore == 1 ? 
-1 : enable_multicore; + stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic, + global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false)); + } + } + if (!is_dynamic) { + RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, false)); + } + stmt = NEXT_PASS(SelectLower, stmt); + stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt); + if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) { + stmt = NEXT_PASS(GatherLoopInfo, stmt); + } + stmt = NEXT_PASS(CastFilter, stmt); + if (!is_dynamic) { + stmt = NEXT_PASS(SplitTail, stmt); + } + stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true), + global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic); + // must be after EmitInsn + stmt = NEXT_PASS(TileCoverCorrect, stmt); + if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) { + // simulated blocks > 2 400 000 => simulated case takes too much time (> 100 sec) + // number of protections > 512 => too many brackets in the if statement throw an error + stmt = NEXT_PASS(CoverProtection, stmt, 2400000, 512); + } + stmt = NEXT_PASS(ConvertDivModToShift, stmt); + if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) { + // for conv manual schedule and load3d + stmt = NEXT_PASS(CoarsenImg2Col, stmt); + } + stmt = NEXT_PASS(DTypeAdapter, stmt); + if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) { + stmt = NEXT_PASS(HoistInsn, stmt); + } + // temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope + if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) { + stmt = NEXT_PASS(InvariantHoist, stmt); + } + stmt = NEXT_PASS(SetVectorMaskDefault, stmt); + stmt = NEXT_PASS(ElimVectorMask, stmt); + stmt = NEXT_PASS(ElimDMA, stmt); + if (!is_dynamic) { + stmt = NEXT_PASS(MultiCorePartition, stmt); } - } - if (!is_dynamic) { - RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, 
false)); - } - stmt = NEXT_PASS(SelectLower, stmt); - stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt); - if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) { - stmt = NEXT_PASS(GatherLoopInfo, stmt); - } - stmt = NEXT_PASS(CastFilter, stmt); - if (!is_dynamic) { - stmt = NEXT_PASS(SplitTail, stmt); - } - stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true), - global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic); - // must be after EmitInsn - stmt = NEXT_PASS(TileCoverCorrect, stmt); - if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) { - // simulated blocks > 2 400 000 => simulated case takes too much time (> 100 sec) - // number of protections > 512 => too many brackets in the if statement throw an error - stmt = NEXT_PASS(CoverProtection, stmt, 2400000, 512); - } - stmt = NEXT_PASS(ConvertDivModToShift, stmt); - if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) { - // for conv manual schedule and load3d - stmt = NEXT_PASS(CoarsenImg2Col, stmt); - } - stmt = NEXT_PASS(DTypeAdapter, stmt); - if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) { - stmt = NEXT_PASS(HoistInsn, stmt); - } - // temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope - if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) { - stmt = NEXT_PASS(InvariantHoist, stmt); - } - stmt = NEXT_PASS(SetVectorMaskDefault, stmt); - stmt = NEXT_PASS(ElimVectorMask, stmt); - stmt = NEXT_PASS(ElimDMA, stmt); - if (!is_dynamic) { - stmt = NEXT_PASS(MultiCorePartition, stmt); - } - if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) { - stmt = NEXT_PASS(AutoDoubleBuffer, stmt); - } - stmt = NEXT_PASS(InjectAccessPtrMSG, stmt); - if (!aicpu) { - stmt = NEXT_PASS(InjectPipe, stmt); - } - stmt = NEXT_PASS(ModDivEliminate, stmt); + if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) { + stmt = NEXT_PASS(AutoDoubleBuffer, stmt); 
+ } + stmt = NEXT_PASS(InjectAccessPtrMSG, stmt); + if (!aicpu) { + stmt = NEXT_PASS(InjectPipe, stmt); + } + stmt = NEXT_PASS(ModDivEliminate, stmt); - // Phase 2 - if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) { - stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral); - } - if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) { - stmt = NEXT_PASS(AlgebraSimplify, stmt); + // Phase 2 + if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) { + stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral); + } + if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) { + stmt = NEXT_PASS(AlgebraSimplify, stmt); + } + std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string()); + // attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts + bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true); + // run MaxSAT solver for bank conflicts with no limits on model size or runtime + bool bc_no_limits = false; + // timeout for MaxSAT solver in seconds (int) + int maxsat_timeout = 4; + try { + stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout); + } catch (MemoryAllocationException &e) { + if (enter_count >= max_enter_poly_times) { + CHECK(false) << e.what(); + } + global_attrs.Set(kAllocBits, ktvm::make_const(Int(32), e.alloc_bits_ + e.need_bits_)); + global_attrs.Set(kErrorScope, StringImm::make(e.scope_)); + continue; + } + break; } - std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string()); - // attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts - bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true); - // run MaxSAT solver for bank conflicts with no limits on model size or runtime - bool bc_no_limits = false; - // 
timeout for MaxSAT solver in seconds (int) - int maxsat_timeout = 4; - stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout); if (!is_dynamic) stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth, diff --git a/src/codegen/util.cc b/src/codegen/util.cc index cfbb462..df113bd 100644 --- a/src/codegen/util.cc +++ b/src/codegen/util.cc @@ -98,7 +98,13 @@ int AttrMap::GetIntAttr(const std::string &attr_name, int dft_value) { const NodeRef &e = this->at(attr_name); return ir::GetInt32Const(Downcast(e)); } - +double AttrMap::GetFloatAttr(const std::string &attr_name, double dft_value) { + if (this->count(attr_name) == 0) { + return dft_value; + } + const NodeRef &e = this->at(attr_name); + return ir::GetFloatConst(Downcast(e)); +} bool AttrMap::GetBoolAttr(const std::string &attr_name, bool dft_value) { int result = GetIntAttr(attr_name, static_cast(dft_value)); CHECK(result == 0 || result == 1) << "Bool attribute " << attr_name << " must be 0 or 1, but found " diff --git a/src/codegen/util.h b/src/codegen/util.h index eeb3b97..3639f99 100644 --- a/src/codegen/util.h +++ b/src/codegen/util.h @@ -91,6 +91,11 @@ constexpr auto kEnableRemoveBroadcastCopy = "enable_remove_broadcast_copy"; constexpr auto kEnableSubstituteDivVar = "enable_divide_var"; constexpr auto kEnableComputeInPlace = "enable_compute_in_place"; constexpr auto kEnableRewriteScalarCompute = "enable_rewrite_scalar_compute"; +constexpr auto kMaxNumRetryPoly = "max_num_retry_poly"; +constexpr auto kUBRatio = "ub_ratio"; +constexpr auto kErrorInfo = ""; +constexpr auto kErrorScope = ""; +constexpr auto kAllocBits = "alloc_bits"; static std::unordered_map help_tiling_level = { {"None", 0}, @@ -109,7 +114,7 @@ class AttrMap : public Map { bool GetBoolAttr(const std::string &attr_name, bool dft_value); int GetIntAttr(const std::string &attr_name, int dft_value); - + double GetFloatAttr(const std::string &attr_name, double 
dft_value); bool GetStringAttr(const std::string &attr_name, std::string *attr_to_set); std::string GetStringAttr(const std::string &attr_name, const std::string &dft_value); }; diff --git a/src/include/build_module.h b/src/include/build_module.h index ebc3211..d706b33 100644 --- a/src/include/build_module.h +++ b/src/include/build_module.h @@ -18,11 +18,33 @@ #define INCLUDE_AKG_BUILD_MODULE_H_ #include +#include #include "codegen/util.h" namespace akg { extern AttrMap global_attrs; + +/* + * Custom exception used when memory allocation fails and triggers micro-tuning to try to recover from failure. + */ +class MemoryAllocationException : public std::exception { + public: + MemoryAllocationException(const std::string &scope, uint64_t need_bits, uint64_t alloc_bits) + : scope_(scope), need_bits_(need_bits), alloc_bits_(alloc_bits){}; + + const char *what() const throw() { + std::runtime_error re(("Allocation exceed bound of memory tag " + scope_ + ": need " + std::to_string(need_bits_) + + " bits, total alloc " + std::to_string(alloc_bits_) + " bits.") + .c_str()); + return re.what(); + } + + std::string scope_{""}; + uint64_t need_bits_{0}; + uint64_t alloc_bits_{0}; +}; + NodeRef Lower(Schedule sch, const Array &in_args, const Array &shape_vars, const std::string &name, const Map &in_binds, const Map &in_attrs, bool simple_mode, bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config); diff --git a/src/pass/storage_rewrite_cce.cc b/src/pass/storage_rewrite_cce.cc index 47df602..a11042b 100644 --- a/src/pass/storage_rewrite_cce.cc +++ b/src/pass/storage_rewrite_cce.cc @@ -26,6 +26,7 @@ #include #include "ir_pass.h" +#include "build_module.h" #include "pass/ir_util.h" #include "emit_insn/insn_info.h" #include "pass/storage_rewrite_cce.h" @@ -1146,8 +1147,7 @@ bool StoragePlanRewriterCCE::DoRewrite(const std::string scope, std::vectortype.bits() <= info->max_num_bits (5242880 vs. 2097152) : + * Allocation exceed bound of memory tag local.UB. 
+ * ratio : memory_size / alloc_size = (2097152 / 5242880) = 0.4, which means the total allocation + * size used in auto tiling should be reduced to 0.4 times its current size. + */ +double TilingSolver::GetNewAllocRatioWhenFlattenFail(const std::string &error_info) { + std::vector sub_strs; + sub_strs = akg::common::Split(error_info, "("); + CHECK_GE(sub_strs.size(), 2U); + std::string tmp_str = sub_strs[2]; + sub_strs = akg::common::Split(tmp_str, " "); + CHECK(!sub_strs.empty()); + auto alloc_bits = static_cast(std::strtod(sub_strs[0].c_str(), nullptr)); + + sub_strs = akg::common::Split(error_info, ")"); + CHECK_GE(sub_strs.size(), 1U); + tmp_str = sub_strs[1]; + sub_strs = akg::common::Split(tmp_str, " "); + CHECK(!sub_strs.empty()); + auto memory_bits = static_cast(std::strtod(sub_strs.back().c_str(), nullptr)); + + CHECK_NE(alloc_bits, 0); + return memory_bits / alloc_bits; +} + +/* + * This function returns an adjustment ratio that further reduces the memory allocation limit apart from + * the default percentage reserved for auto double buffer, and tries to generate smaller tile sizes that + * help to recover from memory allocation failure such as the one in storage rewrite cce pass. + */ +double TilingSolver::GetNewAllocRatioWhenRewriteFail(int64_t memory_bits) { + auto actual_allocs = global_attrs.GetFloatAttr(kAllocBits, 0.0); + auto last_adjust_ratio = global_attrs.GetFloatAttr(kUBRatio, 1.0); + auto adjust_ratio = 1.0; + + if (actual_allocs != 0) { + std::stringstream ss; + auto expect_allocs = memory_bits * last_adjust_ratio; + adjust_ratio = (expect_allocs / actual_allocs); + ss << "Adjust memory allocation ratio to " << adjust_ratio << " times and retry tiling."; + global_attrs.Set(kUBRatio, ktvm::make_const(Float(32), adjust_ratio)); + analyzer_.logger_.AppendLog(MICRO_TUNING, ss); + } + return adjust_ratio; +} + void TilingSolver::CollectMemoryLimit() { + // Init memory allocation percentage. 
percentage_ = ALLOCATION_PERCENTAGE; for (auto attr : analyzer_.RootAxis()->attrs) { if (attr.attr_key != "MEM_RATIO") continue; @@ -29,9 +81,27 @@ void TilingSolver::CollectMemoryLimit() { break; } + // Handle previous error info if storage flatten fails and adjust allocation percentage. + auto error_info = global_attrs.GetStringAttr(kErrorInfo, ""); + if (!error_info.empty() && error_info.find("storage_flatten") != std::string::npos) { + std::stringstream ss; + ss << "Get Error Info! -> " << global_attrs.GetStringAttr(kErrorInfo, ""); + percentage_ = percentage_ * GetNewAllocRatioWhenFlattenFail(error_info); + ss << "Adjust memory allocation to " << percentage_ << " of memory size and retry tiling."; + global_attrs.Set(kErrorInfo, StringImm::make("")); + analyzer_.logger_.AppendLog(MICRO_TUNING, ss); + } + + // Init memory limit for each scope and reduce ratio of local.UB if storage rewrite fails previously. DavinciInfo &d_info = DavinciInfo::GetInstance(); + auto error_scope = global_attrs.GetStringAttr(kErrorScope, ""); for (auto i = 0; i < MEM_SCOPE_BULK; ++i) { this->mem_limit_[i] = d_info.GetMemoryLimitInScope(i) * percentage_; + if (i == DavinciMemScope::MEM_SCOPE_UB && error_scope == "local.UB") { + this->mem_limit_[i] = + std::max(static_cast(this->mem_limit_[i] * GetNewAllocRatioWhenRewriteFail(this->mem_limit_[i])), 1); + global_attrs.Set(kErrorScope, StringImm::make("")); + } } } diff --git a/src/poly/tiling_solver.h b/src/poly/tiling_solver.h index 54bd6ed..0f89b98 100644 --- a/src/poly/tiling_solver.h +++ b/src/poly/tiling_solver.h @@ -30,6 +30,8 @@ class TilingSolver { ~TilingSolver() {} void CollectMemoryLimit(); void CollectTileAxisTopDown(); + double GetNewAllocRatioWhenFlattenFail(const std::string &error_info); + double GetNewAllocRatioWhenRewriteFail(int64_t memory_bits); TileCandidate *Solve(); TilingAnalyzer &analyzer_; diff --git a/src/poly/tiling_utils.cc b/src/poly/tiling_utils.cc index 03e996b..93c6d15 100644 --- 
a/src/poly/tiling_utils.cc +++ b/src/poly/tiling_utils.cc @@ -29,6 +29,8 @@ void TileLogger::AppendLine(LogStage stage, const std::string &line) { analyze_tiling_space_stage_.emplace_back(line); } else if (stage == DO_TILING) { do_tiling_stage_.emplace_back(line); + } else if (stage == MICRO_TUNING) { + micro_tuning_strage_.emplace_back(line); } else { do_tuning_stage_.emplace_back(line); } @@ -70,6 +72,11 @@ bool TileLogger::DumpLogFile() { of << line << std::endl; } of << "=========================" << std::endl; + of << ">>>>>>>>>> Micro tuning stage <<<<<<<<<<<<" << std::endl; + for (const auto &line : micro_tuning_strage_) { + of << line << std::endl; + } + of << "=========================" << std::endl; of.close(); return true; } diff --git a/src/poly/tiling_utils.h b/src/poly/tiling_utils.h index 38650e2..6e3505e 100644 --- a/src/poly/tiling_utils.h +++ b/src/poly/tiling_utils.h @@ -32,7 +32,7 @@ enum DavinciMemScope { MEM_SCOPE_L0C, MEM_SCOPE_BULK, }; -enum LogStage { ANA_SCHETREE, ANA_BUF_LIVE_EXTENT, ANA_TILING_SPACE, DO_TILING, DO_TUNING }; +enum LogStage { ANA_SCHETREE, ANA_BUF_LIVE_EXTENT, ANA_TILING_SPACE, DO_TILING, DO_TUNING, MICRO_TUNING }; class DavinciInfo { public: @@ -89,6 +89,7 @@ class TileLogger { LogFile analyze_tiling_space_stage_; LogFile do_tiling_stage_; LogFile do_tuning_stage_; + LogFile micro_tuning_strage_; }; } // namespace poly } // namespace ir diff --git a/tests/unittest/pass/test_micro_tuning.py b/tests/unittest/pass/test_micro_tuning.py new file mode 100644 index 0000000..1ba8c99 --- /dev/null +++ b/tests/unittest/pass/test_micro_tuning.py @@ -0,0 +1,28 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""unittest for micro-tuning""" +from akg.utils import kernel_exec +from akg.ops.array import four2five + + +def test_four2five_without_custom_tiling(build_shape, dtype, op_attrs): + """This test case will fail without custom tiling, and micro-tuning will automatically adjust tile sizes.""" + build_attr = op_attrs + [False] + return kernel_exec.op_build_test(four2five.four2five, [build_shape], [dtype], build_attr, kernel_name="four2five", attrs={}, tuning=False) + + +if __name__ == "__main__": + test_four2five_without_custom_tiling( + [32, 1001, 1, 1], "float16", ['NCHW', 'float16']) diff --git a/tests/unittest/unittest.sh b/tests/unittest/unittest.sh index 97d2291..be36c92 100644 --- a/tests/unittest/unittest.sh +++ b/tests/unittest/unittest.sh @@ -22,6 +22,7 @@ casefiles=( "pass/test_promote_if.py" "pass/test_sink_if.py" "pass/test_ir_parser.py" +"pass/test_micro_tuning.py" "pass/test_elim_vector_mask.py" "pass/test_copy_propagation.py" "pass/test_utils_detect_non_linear_index.py" -- GitLab