Commit 4d1be48d authored by mindspore-ci-bot, committed by Gitee

!53 support dynamic memory allocation ratio adjustment in micro-tuning for the allocation-exceeded problem
Merge pull request !53 from yangsijia/feature/micro-tuning
......@@ -116,8 +116,8 @@ def four2five_tiling_strategy_dynamic(tensor, input_format):
strategy.append(ct_util.create_constraint_on_tensor(tensor, 16, ct_util.TileConstraint.FACTOR, 4)[0])
return strategy
@vc_util.check_input_type(akg.tvm.tensor.Tensor, str, str)
def four2five(data, format_, dst_dtype='float16'):
@vc_util.check_input_type(akg.tvm.tensor.Tensor, str, str, bool)
def four2five(data, format_, dst_dtype='float16', need_custom_tiling=True):
"""
Convert 4-dim "data" to 5-dim; the format of "data" is defined in "format_".
......@@ -294,8 +294,9 @@ def four2five(data, format_, dst_dtype='float16'):
dim_info, _ = four2five_set_dim_func(data, format_, dst_dtype)
if dim_info != "":
attrs["dim"] = dim_info
attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion)
else:
if need_custom_tiling:
attrs["custom_tiling"] = four2five_tiling_strategy(output, format_, expansion)
elif need_custom_tiling:
attrs["custom_tiling"] = four2five_tiling_strategy_dynamic(output, format_)
if is_dynamic:
......
......@@ -458,7 +458,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
PassTimer *pass_timer = PassTimer::GetInstance();
global_attrs.Set(kKernelName, StringImm::make(name));
global_attrs.Set(kDumpPassIr, ktvm::make_const(Int(1), config->dump_pass_ir));
global_attrs.Set(kDumpPassIr, ktvm::make_const(Int(32), config->dump_pass_ir));
if (config->dump_pass_ir) {
std::string dump_ir_dir;
if (global_attrs.GetStringAttr(kDumpIrDir, &dump_ir_dir)) {
......@@ -498,7 +498,7 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
stmt = NEXT_PASS(RenameRealize, stmt, binds_0, replace);
bool is_dynamic = !shape_vars.empty();
global_attrs.Set(kIsDynamic, ktvm::make_const(Int(1), is_dynamic));
global_attrs.Set(kIsDynamic, ktvm::make_const(Int(32), is_dynamic));
Array<NodeRef> arg_list_1;
Map<Tensor, Buffer> binds_1;
......@@ -594,227 +594,255 @@ NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef>
NodeRef tuning_spaces = NEXT_PASS(GenTuningSpace, stmt, binds_0, attrs_1, false);
return tuning_spaces;
}
Array<NodeRef> poly_res = NEXT_PASS(AutoPoly, stmt, binds_0, global_attrs, false, is_dynamic);
CHECK_EQ(poly_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(poly_res[0]);
Array<ktvm::Var> tiling_params = ktvm::Downcast<Array<ktvm::Var>>(poly_res[1]);
for (const auto &var : tiling_params) {
arg_list_0.push_back(var);
}
}
if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) {
Array<NodeRef> arg_list_2;
Map<Tensor, Buffer> binds_2;
FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2);
stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2);
arg_list_0 = arg_list_2;
binds_0 = binds_2;
}
// micro-tuning configs: current strategy is to retry autopoly up to 3 times when storage flatten/rewrite fails
bool need_micro_tuning = !aicpu && polyhedral && !is_dynamic && global_attrs.GetStringAttr("dim", "").empty();
const int max_enter_poly_times = global_attrs.GetIntAttr(kMaxNumRetryPoly, need_micro_tuning ? 4 : 1);
int enter_count = 0;
Stmt stmt_before_poly = stmt;
while (enter_count < max_enter_poly_times) {
if (!aicpu && polyhedral) {
Array<NodeRef> poly_res = NEXT_PASS(AutoPoly, stmt_before_poly, binds_0, global_attrs, false, is_dynamic);
enter_count++;
CHECK_EQ(poly_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(poly_res[0]);
Array<ktvm::Var> tiling_params = ktvm::Downcast<Array<ktvm::Var>>(poly_res[1]);
for (const auto &var : tiling_params) {
arg_list_0.push_back(var);
}
if (is_dynamic) {
if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) {
stmt = NEXT_PASS(SubstituteDivVar, stmt);
if (global_attrs.GetBoolAttr(kTileSizeIsVar, false)) {
Array<NodeRef> arg_list_2;
Map<Tensor, Buffer> binds_2;
FixParametricBinds(binds_0, arg_list_0, config, &binds_2, &arg_list_2);
stmt = NEXT_PASS(FixBindBuffer, stmt, binds_2);
arg_list_0 = arg_list_2;
binds_0 = binds_2;
}
// fix var addresses because poly identifies vars by name
stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0);
// isolate dynamic tile loops (isolate body and tail)
if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) {
stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false));
stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0);
if (is_dynamic) {
if (global_attrs.GetBoolAttr(kEnableSubstituteDivVar, false)) {
stmt = NEXT_PASS(SubstituteDivVar, stmt);
}
// fix var addresses because poly identifies vars by name
stmt = NEXT_PASS(UnifyLoopVars, stmt, binds_0, arg_list_0);
// isolate dynamic tile loops (isolate body and tail)
if (global_attrs.GetBoolAttr(kEnableIsolateLoop, true)) {
stmt = NEXT_PASS(IsolateLoops, stmt, global_attrs.GetBoolAttr(kEnableIsolateMinMax, false));
stmt = NEXT_PASS(PromoteLetStmt, stmt, arg_list_0);
}
}
}
// please do not insert passes between AutoPoly and the cube special passes.
// cube special pass begin
stmt = NEXT_PASS(ExprPatternRewrite, stmt);
stmt = NEXT_PASS(AutoMadPragmaAttr, stmt);
stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic);
stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0);
stmt = NEXT_PASS(PostProcessImg2col, stmt);
stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic);
stmt = NEXT_PASS(BypassL1, stmt);
if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) {
stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic);
}
stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic);
// cube special pass end
stmt = NEXT_PASS(CopyPropagation, stmt, binds_0);
stmt = NEXT_PASS(ConvertCondToExtent, stmt);
bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false);
if (enable_convert_if) {
stmt = NEXT_PASS(FixRealizeShape, stmt);
}
if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) {
stmt = NEXT_PASS(DMASink, stmt);
}
// please do not insert passes between AutoPoly and the cube special passes.
// cube special pass begin
stmt = NEXT_PASS(ExprPatternRewrite, stmt);
stmt = NEXT_PASS(AutoMadPragmaAttr, stmt);
stmt = NEXT_PASS(PostFusion, stmt, binds_0, is_dynamic);
stmt = NEXT_PASS(ReduceFusionOpt, stmt, binds_0);
stmt = NEXT_PASS(PostProcessImg2col, stmt);
stmt = NEXT_PASS(PromoteIfStmt, stmt, is_dynamic);
stmt = NEXT_PASS(BypassL1, stmt);
if (global_attrs.GetBoolAttr(kEnableStrideKernelOp, true)) {
stmt = NEXT_PASS(StrideKernelOp, stmt, binds_0, is_dynamic);
}
stmt = NEXT_PASS(Load3dTrans, stmt, is_dynamic);
// cube special pass end
stmt = NEXT_PASS(CopyPropagation, stmt, binds_0);
stmt = NEXT_PASS(ConvertCondToExtent, stmt);
bool enable_convert_if = global_attrs.GetBoolAttr(kEnableConvertIf, false);
if (enable_convert_if) {
stmt = NEXT_PASS(FixRealizeShape, stmt);
}
if (global_attrs.GetBoolAttr(kEnableDmaSink, false)) {
stmt = NEXT_PASS(DMASink, stmt);
}
stmt = NEXT_PASS(LowerWith, stmt);
stmt = NEXT_PASS(ForEliminate, stmt);
stmt = NEXT_PASS(RealizeCompress, stmt);
stmt = NEXT_PASS(LowerWith, stmt);
stmt = NEXT_PASS(ForEliminate, stmt);
stmt = NEXT_PASS(RealizeCompress, stmt);
if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
stmt = NEXT_PASS(LoopNormlize, stmt);
if (!global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
stmt = NEXT_PASS(LoopNormlize, stmt);
}
stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic);
stmt = NEXT_PASS(InjectAttr, stmt);
stmt = NEXT_PASS(ModDivEliminate, stmt);
if (enable_convert_if) {
stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0);
stmt = NEXT_PASS(FixLoopExtent, stmt);
stmt = NEXT_PASS(ConvertIfToSelect, stmt);
}
}
stmt = NEXT_PASS(PoolingTransform, stmt, is_dynamic);
stmt = NEXT_PASS(InjectAttr, stmt);
stmt = NEXT_PASS(ModDivEliminate, stmt);
if (enable_convert_if) {
stmt = NEXT_PASS(AlignLastAxisLoopExtent, stmt, binds_0);
stmt = NEXT_PASS(FixLoopExtent, stmt);
stmt = NEXT_PASS(ConvertIfToSelect, stmt);
try {
stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64);
} catch (const std::runtime_error &e) {
if (enter_count >= max_enter_poly_times) {
CHECK(false) << e.what();
}
global_attrs.Set(kErrorInfo, StringImm::make(e.what()));
continue;
}
stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false));
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(UnifyAllocate, stmt);
}
}
stmt = NEXT_PASS(StorageFlatten, stmt, binds_0, 64);
stmt = NEXT_PASS(DmaFlatten, stmt, global_attrs.GetBoolAttr(kTileSizeIsVar, false));
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(UnifyAllocate, stmt);
}
if (global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) {
stmt = NEXT_PASS(PreProcess4Multicore, stmt);
}
if (global_attrs.GetBoolAttr(kEleminateOutmostForCond, false)) {
stmt = NEXT_PASS(PreProcess4Multicore, stmt);
}
int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1);
if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) {
stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt);
}
stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false));
int enable_multicore = global_attrs.GetIntAttr(kEnableMulticore, 1);
if (!is_dynamic && enable_multicore != 0 && global_attrs.GetBoolAttr(kMultiCoreLoopSwitchHoist, true)) {
stmt = NEXT_PASS(MultiCoreLoopSwitchHoist, stmt);
}
stmt = NEXT_PASS(LoopSwitchHoist, stmt, global_attrs.GetIntAttr(kEnableHoistAllocate, false));
// Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning,
// 4 : whether to partition convolution or not
if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral);
}
// Loop Partition args : 2 : split_const_loop, 3 : remove Div / Mod ops by partitioning,
// 4 : whether to partition convolution or not
if (!aicpu && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true)) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, true, false, !polyhedral);
}
if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) {
stmt = NEXT_PASS(SinkAllocate, stmt);
}
if (polyhedral && global_attrs.GetBoolAttr(kEnableSinkAllocate, true)) {
stmt = NEXT_PASS(SinkAllocate, stmt);
}
if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) {
// For manual scheduling, or when polyhedral is not used
stmt = NEXT_PASS(UnrollNonConstantExtent, stmt);
}
if (!polyhedral) {
// fix mad attributes and remove dead computations for the manual schedule
stmt = NEXT_PASS(FixMadAttrs, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(CanonicalSimplify, stmt);
}
stmt = NEXT_PASS(ForEliminate, stmt);
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(FixLoopExtent, stmt);
}
if (global_attrs.GetBoolAttr(kLoopPartitionUnroll, false)) {
// For manual scheduling, or when polyhedral is not used
stmt = NEXT_PASS(UnrollNonConstantExtent, stmt);
}
if (!polyhedral) {
// fix mad attributes and remove dead computations for the manual schedule
stmt = NEXT_PASS(FixMadAttrs, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(CanonicalSimplify, stmt);
}
stmt = NEXT_PASS(ForEliminate, stmt);
if (global_attrs.GetBoolAttr(kAlgebraSimplify, false) && is_dynamic) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(FixLoopExtent, stmt);
}
if (!aicpu) {
stmt = NEXT_PASS(AutoPragma, stmt);
}
stmt = NEXT_PASS(EliminateAtomicDma, stmt);
if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) {
stmt = NEXT_PASS(DeadCodeElim, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(RewriteBroadcastVector, stmt);
stmt = NEXT_PASS(OptimizePragma, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true),
global_attrs.GetIntAttr(kEnableScalarAlign, false));
} else {
stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt);
}
stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic);
stmt = NEXT_PASS(AutoReorder, stmt);
if (enable_multicore != 0) {
if (is_dynamic && enable_multicore == 1) {
Var block_dim = Variable::make(Int(32), "blockDim");
Array<NodeRef> multicore_res =
NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0));
CHECK_EQ(multicore_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(multicore_res[0]);
auto extent_thread = ktvm::Downcast<Integer>(multicore_res[1]);
if (extent_thread.as<IntImm>()->value == -1) {
arg_list_0.push_back(block_dim);
}
if (!aicpu) {
stmt = NEXT_PASS(AutoPragma, stmt);
}
stmt = NEXT_PASS(EliminateAtomicDma, stmt);
if (global_attrs.GetBoolAttr(kDeadCodeElim, false)) {
stmt = NEXT_PASS(DeadCodeElim, stmt);
}
if (!is_dynamic) {
stmt = NEXT_PASS(RewriteBroadcastVector, stmt);
stmt = NEXT_PASS(OptimizePragma, stmt);
}
if (is_dynamic) {
stmt = NEXT_PASS(AnalyzeMinAlignDynamic, stmt, global_attrs.GetIntAttr(kEnableConvAnalyzeAlign, true),
global_attrs.GetIntAttr(kEnableScalarAlign, false));
} else {
int block_dim = enable_multicore == 1 ? -1 : enable_multicore;
stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic,
global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false));
stmt = NEXT_PASS(AnalyzeMinAlignStatic, stmt);
}
stmt = NEXT_PASS(MultiLastAxisReductions, stmt, is_dynamic);
stmt = NEXT_PASS(AutoReorder, stmt);
if (enable_multicore != 0) {
if (is_dynamic && enable_multicore == 1) {
Var block_dim = Variable::make(Int(32), "blockDim");
Array<NodeRef> multicore_res =
NEXT_PASS(InjectMultiCoreVar, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0));
CHECK_EQ(multicore_res.size(), 2);
stmt = ktvm::Downcast<Stmt>(multicore_res[0]);
auto extent_thread = ktvm::Downcast<Integer>(multicore_res[1]);
if (extent_thread.as<IntImm>()->value == -1) {
arg_list_0.push_back(block_dim);
}
} else {
int block_dim = enable_multicore == 1 ? -1 : enable_multicore;
stmt = NEXT_PASS(InjectMultiCore, stmt, block_dim, global_attrs.GetIntAttr(kMergeOuterLoop, 0), is_dynamic,
global_attrs.GetBoolAttr(kMultiCoreScalarRerrange, false));
}
}
if (!is_dynamic) {
RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, false));
}
stmt = NEXT_PASS(SelectLower, stmt);
stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
stmt = NEXT_PASS(GatherLoopInfo, stmt);
}
stmt = NEXT_PASS(CastFilter, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(SplitTail, stmt);
}
stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true),
global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic);
// must be after EmitInsn
stmt = NEXT_PASS(TileCoverCorrect, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
// simulated blocks > 2 400 000 => simulated case takes too much time (> 100 sec)
// number of protections > 512 => too many brackets in the if statement throw an error
stmt = NEXT_PASS(CoverProtection, stmt, 2400000, 512);
}
stmt = NEXT_PASS(ConvertDivModToShift, stmt);
if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
// for conv manual schedule and load3d
stmt = NEXT_PASS(CoarsenImg2Col, stmt);
}
stmt = NEXT_PASS(DTypeAdapter, stmt);
if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) {
stmt = NEXT_PASS(HoistInsn, stmt);
}
// temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope
if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) {
stmt = NEXT_PASS(InvariantHoist, stmt);
}
stmt = NEXT_PASS(SetVectorMaskDefault, stmt);
stmt = NEXT_PASS(ElimVectorMask, stmt);
stmt = NEXT_PASS(ElimDMA, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(MultiCorePartition, stmt);
}
}
if (!is_dynamic) {
RecordCore(stmt, global_attrs.GetBoolAttr(kRecordCore, false));
}
stmt = NEXT_PASS(SelectLower, stmt);
stmt = NEXT_PASS(ReplaceFargmaxCasts, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
stmt = NEXT_PASS(GatherLoopInfo, stmt);
}
stmt = NEXT_PASS(CastFilter, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(SplitTail, stmt);
}
stmt = NEXT_PASS(EmitInsn, stmt, global_attrs.GetBoolAttr(kEnableBisectOptimize, true),
global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true), binds_0, is_dynamic);
// must be after EmitInsn
stmt = NEXT_PASS(TileCoverCorrect, stmt);
if (global_attrs.GetBoolAttr(kEnableCoverProtectOptimize, true) && !is_dynamic) {
// simulated blocks > 2 400 000 => simulated case takes too much time (> 100 sec)
// number of protections > 512 => too many brackets in the if statement throw an error
stmt = NEXT_PASS(CoverProtection, stmt, 2400000, 512);
}
stmt = NEXT_PASS(ConvertDivModToShift, stmt);
if (!polyhedral || global_attrs.GetBoolAttr(kCoarsenImg2Col, false)) {
// for conv manual schedule and load3d
stmt = NEXT_PASS(CoarsenImg2Col, stmt);
}
stmt = NEXT_PASS(DTypeAdapter, stmt);
if (global_attrs.GetBoolAttr(kEnableHoistInsn, true)) {
stmt = NEXT_PASS(HoistInsn, stmt);
}
// temp disable InvariantHoist for dynamic shape because it may move LetStmt out of scope
if (global_attrs.GetBoolAttr(kEnableInvariantHoist, true)) {
stmt = NEXT_PASS(InvariantHoist, stmt);
}
stmt = NEXT_PASS(SetVectorMaskDefault, stmt);
stmt = NEXT_PASS(ElimVectorMask, stmt);
stmt = NEXT_PASS(ElimDMA, stmt);
if (!is_dynamic) {
stmt = NEXT_PASS(MultiCorePartition, stmt);
}
if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) {
stmt = NEXT_PASS(AutoDoubleBuffer, stmt);
}
stmt = NEXT_PASS(InjectAccessPtrMSG, stmt);
if (!aicpu) {
stmt = NEXT_PASS(InjectPipe, stmt);
}
stmt = NEXT_PASS(ModDivEliminate, stmt);
if (global_attrs.GetBoolAttr(kEnableDoubleBuffer, true)) {
stmt = NEXT_PASS(AutoDoubleBuffer, stmt);
}
stmt = NEXT_PASS(InjectAccessPtrMSG, stmt);
if (!aicpu) {
stmt = NEXT_PASS(InjectPipe, stmt);
}
stmt = NEXT_PASS(ModDivEliminate, stmt);
// Phase 2
if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral);
}
if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
// Phase 2
if (!simple_mode && global_attrs.GetBoolAttr(kEnablePostPolyLoopPartition, true) && !is_dynamic) {
stmt = NEXT_PASS(LoopPartitionCCE, stmt, config->partition_const_loop, true, !polyhedral);
}
if (global_attrs.GetBoolAttr(kEnablePreStorageWriteSimplify, false)) {
stmt = NEXT_PASS(AlgebraSimplify, stmt);
}
std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string());
// attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts
bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true);
// run MaxSAT solver for bank conflicts with no limits on model size or runtime
bool bc_no_limits = false;
// timeout for MaxSAT solver in seconds (int)
int maxsat_timeout = 4;
try {
stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout);
} catch (MemoryAllocationException &e) {
if (enter_count >= max_enter_poly_times) {
CHECK(false) << e.what();
}
global_attrs.Set(kAllocBits, ktvm::make_const(Int(32), e.alloc_bits_ + e.need_bits_));
global_attrs.Set(kErrorScope, StringImm::make(e.scope_));
continue;
}
break;
}
std::string maxsat_filename = global_attrs.GetStringAttr(kMaxsatFile, std::string());
// attempt to optimize UB memory layout to reduce bank conflicts and pipeline conflicts
bool use_bc_opt = global_attrs.GetBoolAttr(kUseBcOpt, true);
// run MaxSAT solver for bank conflicts with no limits on model size or runtime
bool bc_no_limits = false;
// timeout for MaxSAT solver in seconds (int)
int maxsat_timeout = 4;
stmt = NEXT_PASS(StorageRewriteCCE, stmt, maxsat_filename, use_bc_opt, bc_no_limits, maxsat_timeout);
if (!is_dynamic)
stmt = NEXT_PASS(UnrollLoop, stmt, config->auto_unroll_max_step, config->auto_unroll_max_depth,
......
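The hunk above wraps the lowering pipeline in a retry loop. Stripped of the individual passes, the control flow is simple: run the pipeline; on an allocation failure, record the failing scope and the requested size as hints for the next tiling attempt; fail hard only on the last try. Below is a minimal, self-contained sketch of that pattern. `TuningHints`, `AllocError`, and `run_passes` are hypothetical stand-ins, not AKG APIs.

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <string>

// Hypothetical retry hints; the real code stores these in global_attrs
// under kErrorScope / kAllocBits.
struct TuningHints {
  std::string error_scope;
  uint64_t alloc_bits = 0;
};

// Hypothetical failure type standing in for MemoryAllocationException.
struct AllocError : std::runtime_error {
  AllocError(const std::string &scope, uint64_t bits)
      : std::runtime_error("alloc exceeded in " + scope), scope(scope), bits(bits) {}
  std::string scope;
  uint64_t bits;
};

// Pretend pass pipeline: fails until hints are available to shrink the tiles.
void run_passes(const TuningHints &hints) {
  if (hints.error_scope.empty()) throw AllocError("local.UB", 5242880);
}

int main() {
  const int max_enter_poly_times = 4;  // 1 initial run + up to 3 retries, as in Lower
  TuningHints hints;
  for (int enter_count = 1; enter_count <= max_enter_poly_times; ++enter_count) {
    try {
      run_passes(hints);
      std::cout << "lowering succeeded on attempt " << enter_count << "\n";
      break;  // success: leave the loop, mirroring the break after StorageRewriteCCE
    } catch (const AllocError &e) {
      if (enter_count >= max_enter_poly_times) throw;  // last attempt: fail hard, like CHECK(false)
      hints.error_scope = e.scope;  // recorded so the tiling solver lowers its budget
      hints.alloc_bits = e.bits;
    }
  }
  return 0;
}
```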
......@@ -98,7 +98,13 @@ int AttrMap::GetIntAttr(const std::string &attr_name, int dft_value) {
const NodeRef &e = this->at(attr_name);
return ir::GetInt32Const(Downcast<Expr>(e));
}
double AttrMap::GetFloatAttr(const std::string &attr_name, double dft_value) {
if (this->count(attr_name) == 0) {
return dft_value;
}
const NodeRef &e = this->at(attr_name);
return ir::GetFloatConst(Downcast<Expr>(e));
}
bool AttrMap::GetBoolAttr(const std::string &attr_name, bool dft_value) {
int result = GetIntAttr(attr_name, static_cast<int>(dft_value));
CHECK(result == 0 || result == 1) << "Bool attribute " << attr_name << " must be 0 or 1, but found "
......
......@@ -91,6 +91,11 @@ constexpr auto kEnableRemoveBroadcastCopy = "enable_remove_broadcast_copy";
constexpr auto kEnableSubstituteDivVar = "enable_divide_var";
constexpr auto kEnableComputeInPlace = "enable_compute_in_place";
constexpr auto kEnableRewriteScalarCompute = "enable_rewrite_scalar_compute";
constexpr auto kMaxNumRetryPoly = "max_num_retry_poly";
constexpr auto kUBRatio = "ub_ratio";
constexpr auto kErrorInfo = "error_info";
constexpr auto kErrorScope = "error_scope";
constexpr auto kAllocBits = "alloc_bits";
static std::unordered_map<std::string, int> help_tiling_level = {
{"None", 0},
......@@ -109,7 +114,7 @@ class AttrMap : public Map<std::string, NodeRef> {
bool GetBoolAttr(const std::string &attr_name, bool dft_value);
int GetIntAttr(const std::string &attr_name, int dft_value);
double GetFloatAttr(const std::string &attr_name, double dft_value);
bool GetStringAttr(const std::string &attr_name, std::string *attr_to_set);
std::string GetStringAttr(const std::string &attr_name, const std::string &dft_value);
};
......
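The header change above replaces the out-parameter overload `bool GetStringAttr(name, std::string*)` with a default-value overload, matching `GetIntAttr` and the new `GetFloatAttr`. A minimal sketch of that lookup-or-default pattern over plain maps (the real `AttrMap` wraps a TVM `Map<std::string, NodeRef>`, so this is an analogy rather than the actual class):

```cpp
#include <iostream>
#include <map>
#include <string>

// Simplified analogue of AttrMap's default-value accessors.
class Attrs {
 public:
  void SetFloat(const std::string &k, double v) { floats_[k] = v; }
  double GetFloatAttr(const std::string &k, double dft) const {
    auto it = floats_.find(k);
    return it == floats_.end() ? dft : it->second;
  }
  void SetString(const std::string &k, const std::string &v) { strings_[k] = v; }
  // New-style GetStringAttr: returns the value (or a default) instead of
  // filling an out-parameter as the old bool overload did.
  std::string GetStringAttr(const std::string &k, const std::string &dft) const {
    auto it = strings_.find(k);
    return it == strings_.end() ? dft : it->second;
  }
 private:
  std::map<std::string, double> floats_;
  std::map<std::string, std::string> strings_;
};

int main() {
  Attrs attrs;
  std::cout << attrs.GetFloatAttr("ub_ratio", 1.0) << "\n";  // 1: no ratio recorded yet
  attrs.SetFloat("ub_ratio", 0.4);                           // set after a failed rewrite
  std::cout << attrs.GetFloatAttr("ub_ratio", 1.0) << "\n";  // 0.4 on the retry
  std::cout << attrs.GetStringAttr("error_scope", "<none>") << "\n";
  return 0;
}
```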
......@@ -18,11 +18,33 @@
#define INCLUDE_AKG_BUILD_MODULE_H_
#include <string>
#include <exception>
#include "codegen/util.h"
namespace akg {
extern AttrMap global_attrs;
/*
 * Custom exception thrown when memory allocation fails; it triggers micro-tuning to try to recover from the failure.
 */
class MemoryAllocationException : public std::exception {
public:
  MemoryAllocationException(const std::string &scope, uint64_t need_bits, uint64_t alloc_bits)
      : scope_(scope), need_bits_(need_bits), alloc_bits_(alloc_bits),
        msg_("Allocation exceed bound of memory tag " + scope + ": need " + std::to_string(need_bits) +
             " bits, total alloc " + std::to_string(alloc_bits) + " bits.") {}
  // Keep the message in a member string: returning what() from a temporary
  // std::runtime_error would leave a dangling pointer.
  const char *what() const noexcept override { return msg_.c_str(); }
  std::string scope_{""};
  uint64_t need_bits_{0};
  uint64_t alloc_bits_{0};

 private:
  std::string msg_;
};
NodeRef Lower(Schedule sch, const Array<NodeRef> &in_args, const Array<NodeRef> &shape_vars, const std::string &name,
const Map<Tensor, Buffer> &in_binds, const Map<std::string, NodeRef> &in_attrs, bool simple_mode,
bool polyhedral, bool tuning, bool aicpu, const BuildConfig &config);
......
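As a quick sanity check of the exception added above, the snippet below throws and catches it the way `Lower` does around `StorageRewriteCCE`. The class is re-declared in condensed form so the example builds standalone; as in the fixed version above, the message lives in a member string so the pointer returned by `what()` stays valid after the call.

```cpp
#include <cstdint>
#include <exception>
#include <iostream>
#include <string>

// Condensed copy of the exception so the example builds standalone.
class MemoryAllocationException : public std::exception {
 public:
  MemoryAllocationException(const std::string &scope, uint64_t need_bits, uint64_t alloc_bits)
      : scope_(scope), need_bits_(need_bits), alloc_bits_(alloc_bits),
        msg_("Allocation exceed bound of memory tag " + scope + ": need " + std::to_string(need_bits) +
             " bits, total alloc " + std::to_string(alloc_bits) + " bits.") {}
  const char *what() const noexcept override { return msg_.c_str(); }
  std::string scope_;
  uint64_t need_bits_;
  uint64_t alloc_bits_;
 private:
  std::string msg_;
};

int main() {
  try {
    throw MemoryAllocationException("local.UB", 5242880, 2097152);
  } catch (const MemoryAllocationException &e) {
    // Lower's catch records e.scope_ (kErrorScope) and e.alloc_bits_ + e.need_bits_
    // (kAllocBits) as hints before retrying AutoPoly.
    std::cout << e.what() << std::endl;
  }
  return 0;
}
```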
......@@ -26,6 +26,7 @@
#include <regex>
#include "ir_pass.h"
#include "build_module.h"
#include "pass/ir_util.h"
#include "emit_insn/insn_info.h"
#include "pass/storage_rewrite_cce.h"
......@@ -1146,8 +1147,7 @@ bool StoragePlanRewriterCCE::DoRewrite(const std::string scope, std::vector<std:
}
if (spec_level <= 0 || child_idx < 0) {
if (!is_dynamic_) {
LOG(FATAL) << "Allocation exceed bound of memory tag " << scope << ": need " << need_nbits
<< " bits, total alloc " << total_alloc_bits << " bits";
throw MemoryAllocationException(scope, need_nbits, total_alloc_bits);
} else {
LOG(WARNING) << "Dynamic shape static allocation exceed bound of memory tag " << scope << ": need "
<< need_nbits << " bits, will use dynamic allocation instead";
......
......@@ -16,11 +16,63 @@
*/
#include "poly/tiling_solver.h"
#include "build_module.h"
namespace akg {
namespace ir {
namespace poly {
/*
* This function parses the StorageFlatten error info into a ratio that guides auto tiling to reduce
* memory allocation.
* e.g.
* error info : Check failed: const_size * op->type.bits() <= info->max_num_bits (5242880 vs. 2097152) :
* Allocation exceed bound of memory tag local.UB.
* ratio : memory_size / alloc_size = (2097152 / 5242880) = 0.4, which means the total allocation
* size used in auto tiling should be reduced to 0.4 of its previous value.
*/
double TilingSolver::GetNewAllocRatioWhenFlattenFail(const std::string &error_info) {
std::vector<std::string> sub_strs;
sub_strs = akg::common::Split(error_info, "(");
CHECK_GE(sub_strs.size(), 3U);  // sub_strs[2] is accessed below
std::string tmp_str = sub_strs[2];
sub_strs = akg::common::Split(tmp_str, " ");
CHECK(!sub_strs.empty());
auto alloc_bits = static_cast<double>(std::strtod(sub_strs[0].c_str(), nullptr));
sub_strs = akg::common::Split(error_info, ")");
CHECK_GE(sub_strs.size(), 2U);  // sub_strs[1] is accessed below
tmp_str = sub_strs[1];
sub_strs = akg::common::Split(tmp_str, " ");
CHECK(!sub_strs.empty());
auto memory_bits = static_cast<double>(std::strtod(sub_strs.back().c_str(), nullptr));
CHECK_NE(alloc_bits, 0);
return memory_bits / alloc_bits;
}
/*
* This function returns an adjustment ratio that further reduces the memory allocation limit, beyond the
* default percentage reserved for auto double buffer, and tries to generate smaller tile sizes that help
* recover from memory allocation failures such as the one in the storage rewrite CCE pass.
*/
double TilingSolver::GetNewAllocRatioWhenRewriteFail(int64_t memory_bits) {
auto actual_allocs = global_attrs.GetFloatAttr(kAllocBits, 0.0);
auto last_adjust_ratio = global_attrs.GetFloatAttr(kUBRatio, 1.0);
auto adjust_ratio = 1.0;
if (actual_allocs != 0) {
std::stringstream ss;
auto expect_allocs = memory_bits * last_adjust_ratio;
adjust_ratio = (expect_allocs / actual_allocs);
ss << "Adjust memory allocation ratio to " << adjust_ratio << " times and retry tiling.";
global_attrs.Set(kUBRatio, ktvm::make_const(Float(32), adjust_ratio));
analyzer_.logger_.AppendLog(MICRO_TUNING, ss);
}
return adjust_ratio;
}
void TilingSolver::CollectMemoryLimit() {
// Init memory allocation percentage.
percentage_ = ALLOCATION_PERCENTAGE;
for (auto attr : analyzer_.RootAxis()->attrs) {
if (attr.attr_key != "MEM_RATIO") continue;
......@@ -29,9 +81,27 @@ void TilingSolver::CollectMemoryLimit() {
break;
}
// Handle previous error info if storage flatten fails and adjust allocation percentage.
auto error_info = global_attrs.GetStringAttr(kErrorInfo, "");
if (!error_info.empty() && error_info.find("storage_flatten") != std::string::npos) {
std::stringstream ss;
ss << "Get Error Info! -> " << global_attrs.GetStringAttr(kErrorInfo, "");
percentage_ = percentage_ * GetNewAllocRatioWhenFlattenFail(error_info);
ss << "Adjust memory allocation to " << percentage_ << " of memory size and retry tiling.";
global_attrs.Set(kErrorInfo, StringImm::make(""));
analyzer_.logger_.AppendLog(MICRO_TUNING, ss);
}
// Init the memory limit for each scope; reduce the local.UB ratio if storage rewrite failed previously.
DavinciInfo &d_info = DavinciInfo::GetInstance();
auto error_scope = global_attrs.GetStringAttr(kErrorScope, "");
for (auto i = 0; i < MEM_SCOPE_BULK; ++i) {
this->mem_limit_[i] = d_info.GetMemoryLimitInScope(i) * percentage_;
if (i == DavinciMemScope::MEM_SCOPE_UB && error_scope == "local.UB") {
this->mem_limit_[i] =
std::max(static_cast<int>(this->mem_limit_[i] * GetNewAllocRatioWhenRewriteFail(this->mem_limit_[i])), 1);
global_attrs.Set(kErrorScope, StringImm::make(""));
}
}
}
......
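To make the two recovery ratios concrete: for the sample StorageFlatten message in the comment above, the parsed values are alloc = 5242880 bits and limit = 2097152 bits, so the solver retries with 2097152 / 5242880 = 0.4 of the previous budget. The sketch below reproduces both calculations with plain string handling; `Split` is a stand-in for `akg::common::Split`, and the rewrite-fail numbers are illustrative.

```cpp
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Minimal stand-in for akg::common::Split (splits on a single character).
static std::vector<std::string> Split(const std::string &s, char delim) {
  std::vector<std::string> out;
  std::stringstream ss(s);
  std::string piece;
  while (std::getline(ss, piece, delim)) out.push_back(piece);
  return out;
}

// Mirrors GetNewAllocRatioWhenFlattenFail on the example message.
double RatioFromFlattenError(const std::string &error_info) {
  auto by_open = Split(error_info, '(');
  // by_open[2] begins with the actual allocation: "5242880 vs. 2097152) : ..."
  double alloc_bits = std::stod(Split(by_open[2], ' ')[0]);  // 5242880
  auto by_close = Split(error_info, ')');
  auto words = Split(by_close[1], ' ');
  double memory_bits = std::stod(words.back());              // 2097152
  return memory_bits / alloc_bits;                           // 0.4
}

// Mirrors GetNewAllocRatioWhenRewriteFail: shrink the budget so the expected
// allocation under the previous ratio fits what was actually requested.
double RatioFromRewriteFail(double memory_bits, double actual_allocs, double last_ratio) {
  return memory_bits * last_ratio / actual_allocs;
}

int main() {
  const std::string msg =
      "Check failed: const_size * op->type.bits() <= info->max_num_bits "
      "(5242880 vs. 2097152) : Allocation exceed bound of memory tag local.UB.";
  std::cout << RatioFromFlattenError(msg) << "\n";                   // prints 0.4
  std::cout << RatioFromRewriteFail(2097152, 5242880, 1.0) << "\n";  // illustrative: also 0.4
  return 0;
}
```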
......@@ -30,6 +30,8 @@ class TilingSolver {
~TilingSolver() {}
void CollectMemoryLimit();
void CollectTileAxisTopDown();
double GetNewAllocRatioWhenFlattenFail(const std::string &error_info);
double GetNewAllocRatioWhenRewriteFail(int64_t memory_bits);
TileCandidate *Solve();
TilingAnalyzer &analyzer_;
......
......@@ -29,6 +29,8 @@ void TileLogger::AppendLine(LogStage stage, const std::string &line) {
analyze_tiling_space_stage_.emplace_back(line);
} else if (stage == DO_TILING) {
do_tiling_stage_.emplace_back(line);
} else if (stage == MICRO_TUNING) {
micro_tuning_stage_.emplace_back(line);
} else {
do_tuning_stage_.emplace_back(line);
}
......@@ -70,6 +72,11 @@ bool TileLogger::DumpLogFile() {
of << line << std::endl;
}
of << "=========================" << std::endl;
of << ">>>>>>>>>> Micro tuning stage <<<<<<<<<<<<" << std::endl;
for (const auto &line : micro_tuning_stage_) {
of << line << std::endl;
}
of << "=========================" << std::endl;
of.close();
return true;
}
......
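The logger change is mechanical: each `LogStage` owns a line buffer, `AppendLine` routes by stage, and `DumpLogFile` emits one titled section per buffer. A condensed sketch of that routing with only two stages (buffer handling simplified from the real `TileLogger`):

```cpp
#include <iostream>
#include <string>
#include <vector>

enum LogStage { DO_TILING, MICRO_TUNING };  // condensed from the full enum

class TileLogger {
 public:
  void AppendLine(LogStage stage, const std::string &line) {
    // Route each line to its stage buffer, as AppendLine does for MICRO_TUNING.
    (stage == MICRO_TUNING ? micro_tuning_stage_ : do_tiling_stage_).push_back(line);
  }
  void Dump() const {
    std::cout << ">>>>>>>>>> Micro tuning stage <<<<<<<<<<<<\n";
    for (const auto &line : micro_tuning_stage_) std::cout << line << "\n";
  }
 private:
  std::vector<std::string> do_tiling_stage_;
  std::vector<std::string> micro_tuning_stage_;
};

int main() {
  TileLogger logger;
  logger.AppendLine(MICRO_TUNING, "Adjust memory allocation ratio to 0.4 times and retry tiling.");
  logger.Dump();
  return 0;
}
```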
......@@ -32,7 +32,7 @@ enum DavinciMemScope {
MEM_SCOPE_L0C,
MEM_SCOPE_BULK,
};
enum LogStage { ANA_SCHETREE, ANA_BUF_LIVE_EXTENT, ANA_TILING_SPACE, DO_TILING, DO_TUNING };
enum LogStage { ANA_SCHETREE, ANA_BUF_LIVE_EXTENT, ANA_TILING_SPACE, DO_TILING, DO_TUNING, MICRO_TUNING };
class DavinciInfo {
public:
......@@ -89,6 +89,7 @@ class TileLogger {
LogFile analyze_tiling_space_stage_;
LogFile do_tiling_stage_;
LogFile do_tuning_stage_;
LogFile micro_tuning_stage_;
};
} // namespace poly
} // namespace ir
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""unittest for micro-tuning"""
from akg.utils import kernel_exec
from akg.ops.array import four2five
def test_four2five_without_custom_tiling(build_shape, dtype, op_attrs):
"""This test case will fail without cunstom tiling and micro-tuning will automatically adjust tile sizes."""
build_attr = op_attrs + [False]
return kernel_exec.op_build_test(four2five.four2five, [build_shape], [dtype], build_attr, kernel_name="four2five", attrs={}, tuning=False)
if __name__ == "__main__":
test_four2five_without_custom_tiling(
[32, 1001, 1, 1], "float16", ['NCHW', 'float16'])
......@@ -22,6 +22,7 @@ casefiles=(
"pass/test_promote_if.py"
"pass/test_sink_if.py"
"pass/test_ir_parser.py"
"pass/test_micro_tuning.py"
"pass/test_elim_vector_mask.py"
"pass/test_copy_propagation.py"
"pass/test_utils_detect_non_linear_index.py"
......