add tiling priority scoring model

44ef36bd · dabaiji · 5a35fac5 · 44ef36bd · 44ef36bd · 44ef36bd
5 changed files
--- a/src/poly/tiling_analyzer.cc
+++ b/src/poly/tiling_analyzer.cc
@@ -1303,12 +1303,7 @@ int TilingAnalyzer::GetNumOfAxisInBand(int band_idx) const {
  return max + 1;
 }

-void TilingAnalyzer::TileSpaceAnalyze() {
-  CHECK(scop_);
-
-  SpaceAnalyzer space_analyzer(this);
-  space_analyzer.AnalyzeSpecialAxes();
-
+void TilingAnalyzer::AddTilingConstraints() {
  std::vector<TilingStrategy *> actived_strategies;

  PassDownAttrStrategy pd_attr_strategy(this);
@@ -1351,15 +1346,10 @@ void TilingAnalyzer::TileSpaceAnalyze() {
  TilingStrategyManager &strategy_manager = TilingStrategyManager::GetInstance();
  strategy_manager.SetStrategies(actived_strategies);
  strategy_manager.Execute();
-  logger_.AppendLine(ANA_TILING_SPACE, "After adding constraints =======>");
-  auto PrintAttr = [&](TileAxis *a) -> void {
-    if (a != nullptr) a->DumpAxis();
-  };
-  ForEachAxisTopDown(PrintAttr);
-  logger_.AppendLine(ANA_TILING_SPACE, "<=============");
 }

 bool TilingAnalyzer::Prepare() {
+  // Stage 1: Analyze schedule tree.
  ScheduleTreeAnalyzer sch_ana(this, this->sch_);
  root_axis_ = sch_ana.Build(this->Halide());
  if (root_axis_ == nullptr) {
@@ -1368,25 +1358,42 @@ bool TilingAnalyzer::Prepare() {
  if (root_axis_->children.empty()) {
    return false;
  }
-  auto build_axis_map = [this](const TileAxis *a) {
+  auto BuildAxisMap = [this](const TileAxis *a) {
    for (auto loop : a->loops) {
      CHECK(loop) << "Tile axis has null ptr loop, check";
      this->tile_axis_[loop] = const_cast<TileAxis *>(a);
    }
  };
-  this->ForEachAxisTopDown(build_axis_map);
+  this->ForEachAxisTopDown(BuildAxisMap);
  if (op_type_ != VECTOR_OP) {
    sch_ana.AnalyzeCubeInfo();
  }
-  TileSpaceAnalyze();

+  // Stage 2: Analyze Halide IR and add tiling constraints.
+  SpaceAnalyzer space_analyzer(this);
+  space_analyzer.AnalyzeSpecialAxes();
+  AddTilingConstraints();
+
+  // Stage 3: Analyze buffer footprint.
  LinearAccessPatternBuilder lap_bdr(this);
  lap_bdr.Build(body_);
  linear_seq_ = std::move(lap_bdr.seq_);
  buf_info_ = std::move(lap_bdr.buf_);
  buffer_usage_timetable_ = std::move(lap_bdr.buffer_usage_timetable_);

+  // Stage 4: Set tiling priority based on previous analysis.
+  TilingPriorityScorer scroer(*this);
+  scroer.SetPriorityByScoring();
+
+  // Logging
+  logger_.AppendLine(ANA_TILING_SPACE, "After adding constraints =======>");
+  auto PrintAttr = [&](TileAxis *a) -> void {
+    if (a != nullptr) a->DumpAxis();
+  };
+  ForEachAxisTopDown(PrintAttr);
+  logger_.AppendLine(ANA_TILING_SPACE, "<=============");
  DumpLinearSeq();
+
  return true;
 }


--- a/src/poly/tiling_analyzer.h
+++ b/src/poly/tiling_analyzer.h
@@ -288,7 +288,7 @@ class TilingAnalyzer {
  std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_;

 private:
-  void TileSpaceAnalyze();
+  void AddTilingConstraints();
  std::unique_ptr<TileAxis> root_axis_;
 };


--- a/src/poly/tiling_solver.cc
+++ b/src/poly/tiling_solver.cc
@@ -42,7 +42,6 @@ void TilingSolver::CollectTileAxisTopDown() {
    }
    this->cand_.InsertAxisBack(a);
  };
-
  this->cand_.ResetTileAxis();
  this->analyzer_.ForEachAxisTopDown(CollectTileAxis);
  this->cand_.SortByPriority();
@@ -97,9 +96,11 @@ TileCandidate *InequalitySolver::Solve() {
  auto tile_band_size = static_cast<int>(analyzer_.RootAxis()->children.size());
  for (auto band = 0; band < tile_band_size; ++band) {
    tiling_band_ = band;
+
    CollectTileAxisTopDown();

    InitTileAxis(LEVEL1);
+
    if (analyzer_.op_type_ != VECTOR_OP) {
      InitTileAxis(LEVEL0);
    }
@@ -738,9 +739,11 @@ void DynamicShapeSolver::AppendTileConstraintInIR(TileCandidate *cand, TileLevel

 TileCandidate *TraverseSolver::Solve() {
  CollectMemoryLimit();
+
  auto tile_band_size = static_cast<int>(analyzer_.RootAxis()->children.size());
  for (auto band = 0; band < tile_band_size; ++band) {
    tiling_band_ = band;
+
    CollectTileAxisTopDown();

    // tile all axis top down

--- a/src/poly/tiling_strategy_manager.cc
+++ b/src/poly/tiling_strategy_manager.cc
@@ -14,7 +14,7 @@
 * limitations under the License.
 */
 #include "poly/tiling_strategy_manager.h"
-
+#include <numeric>
 #include <iostream>

 namespace akg {
@@ -222,13 +222,9 @@ void ReduceStrategy::AddConstraint() {
    if (align_elem == block_size) {
      axis->l1_constraints.tile_min_ = align_elem;
    } else {
-      axis->priority += 1;
      axis->forbid_iso = true;
    }
  }
-  for (auto axis : analyzer_->GetAxesOfAttr("REDUCE_SRC_LAST")) {
-    axis->priority += 1;
-  }
 }

 void VectorizedStrategy::AddConstraint() {
@@ -553,6 +549,104 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
  return (valid && efficient) ? tiling_factor : origin_factor;
 }

+/*
+ * Assign a tiling priority to the tile axes of every band under the root axis.
+ * For each band, three per-axis feature scores (tile dependency, parallelism, vectorization) are
+ * scaled by MinMaxScaler and combined into a weighted average. Axes are then ranked by that total:
+ * the axis with the highest total receives priority 0 and the one with the lowest total receives
+ * (number of axes in the band - 1).
+ * A band is left unchanged as soon as one of its axes has a priority different from -1.
+ */
+void TilingPriorityScorer::SetPriorityByScoring() {
+  std::stringstream ss;
+  const int band_size = static_cast<int>(analyzer_.RootAxis()->children.size());
+  for (int band_idx = 0; band_idx < band_size; ++band_idx) {
+    std::vector<TileAxis *> tile_axes = GetBandTileAxes(band_idx);
+    if (tile_axes.empty()) {
+      // No axis to rank in this band; skipping also avoids scaling empty score vectors.
+      continue;
+    }
+
+    const auto norm_range = static_cast<int>(tile_axes.size());
+    auto dd_scores = MinMaxScaler(ComputeTileDependency(tile_axes), norm_range);
+    auto pl_scores = MinMaxScaler(ComputeParallelism(tile_axes), norm_range);
+    auto vec_scores = MinMaxScaler(ComputeVectorization(tile_axes), norm_range);
+    const auto weight_sum = weight_.Sum();
+
+    // Axes grouped by total score; std::map keeps the groups in ascending score order.
+    std::map<double, std::vector<TileAxis *>> priority_map;
+    bool has_custom_priority = false;
+    const int default_priority = -1;
+    for (int i = 0; i < norm_range; ++i) {
+      auto axis = tile_axes[i];
+
+      if (axis->priority != default_priority) {
+        // This axis does not carry the default priority, so the whole band is left as it is.
+        has_custom_priority = true;
+        break;
+      }
+
+      ss << "Axis " << axis->index << " , " << axis->dim_axis << ": ";
+      auto total_score = (weight_.tile_dependency * dd_scores[i] + weight_.parallelism * pl_scores[i] +
+                          weight_.vectorization * vec_scores[i]) /
+                         weight_sum;
+      ss << "score = (tile dependency) " << weight_.tile_dependency << "*" << dd_scores[i] << " + (parallelism) "
+         << weight_.parallelism << " * " << pl_scores[i] << " + (vectorization) " << weight_.vectorization << " * "
+         << vec_scores[i] << " / " << weight_sum << " = " << total_score;
+      logger_.AppendLog(DO_TILING, ss);
+
+      // operator[] creates the group on first use, so no separate lookup is needed.
+      priority_map[total_score].emplace_back(axis);
+    }
+
+    if (has_custom_priority) {
+      continue;
+    }
+
+    // Walk the groups from lowest to highest score while counting down, so the highest score ends at 0.
+    int priority = norm_range - 1;
+    for (const auto &entry : priority_map) {
+      for (auto *scored_axis : entry.second) {
+        scored_axis->priority = priority;
+        --priority;
+      }
+    }
+  }
+}
+
+/*
+ * Produce one tile-dependency score per axis, in the same order as `tile_axes`.
+ * Each score is (dim_axis + 1) multiplied by the result of HasAttr("REDUCE_AXIS"), so an axis for
+ * which HasAttr yields false/0 scores 0.
+ */
+std::vector<double> TilingPriorityScorer::ComputeTileDependency(std::vector<TileAxis *> tile_axes) {
+  const auto axis_count = tile_axes.size();
+  std::vector<double> dependency(axis_count);
+  for (auto pos = axis_count - axis_count; pos < axis_count; ++pos) {
+    auto current = tile_axes[pos];
+    dependency[pos] = (current->dim_axis + 1) * current->HasAttr("REDUCE_AXIS");
+  }
+  return dependency;
+}
+
+/*
+ * Produce one parallelism score per axis, in the same order as `tile_axes`:
+ * 0 when the axis' mc_sup flag is set, 1 when it is not.
+ */
+std::vector<double> TilingPriorityScorer::ComputeParallelism(std::vector<TileAxis *> tile_axes) {
+  const auto axis_count = tile_axes.size();
+  std::vector<double> parallelism(axis_count);
+  for (auto pos = axis_count - axis_count; pos < axis_count; ++pos) {
+    parallelism[pos] = tile_axes[pos]->mc_sup ? 0 : 1;
+  }
+  return parallelism;
+}
+
+/*
+ * Produce one vectorization score per axis, in the same order as `tile_axes`.
+ * For every buffer in analyzer_.buf_info_ whose tile_axis list contains the axis, the score grows by
+ * (scope coefficient) * (1-based position of the axis in that list). Scopes that are not listed in
+ * the coefficient table contribute 0.
+ */
+std::vector<double> TilingPriorityScorer::ComputeVectorization(std::vector<TileAxis *> tile_axes) {
+  // Read-only table: built once and looked up with find() so that a lookup never inserts.
+  static const std::unordered_map<DavinciMemScope, int> coef_map = {
+    {DavinciMemScope::MEM_SCOPE_GM, 2},   // continuous dma copy is considered as the most important factor
+    {DavinciMemScope::MEM_SCOPE_UB, 1},   // vectorization instruction is also important
+    {DavinciMemScope::MEM_SCOPE_L1, 0},   // does not consider impact of L1 dma copy
+    {DavinciMemScope::MEM_SCOPE_L0A, 0},  // does not consider impact of L0 dma copy
+    {DavinciMemScope::MEM_SCOPE_L0B, 0}, {DavinciMemScope::MEM_SCOPE_L0C, 0},
+  };
+  std::vector<double> scores;
+  scores.reserve(tile_axes.size());
+  for (auto axis : tile_axes) {
+    int vec_level = 0;
+    // Iterate by const reference: copying an entry would copy its name string and shared_ptr.
+    for (const auto &entry : analyzer_.buf_info_) {
+      const auto &buf = entry.second;
+      const auto coef_it = coef_map.find(buf->scope);
+      const int coef = (coef_it == coef_map.end()) ? 0 : coef_it->second;
+      if (coef == 0) {
+        // This buffer cannot change the score, so its axis list does not need to be searched.
+        continue;
+      }
+      int dim_depth = 1;
+      for (const auto &a : *(buf->tile_axis)) {
+        if (a == axis) {
+          vec_level += coef * dim_depth;
+          break;
+        }
+        ++dim_depth;
+      }
+    }
+    scores.emplace_back(vec_level);
+  }
+  return scores;
+}
+
 }  // namespace poly
 }  // namespace ir
 }  // namespace akg
--- a/src/poly/tiling_strategy_manager.h
+++ b/src/poly/tiling_strategy_manager.h
@@ -17,6 +17,7 @@
 #define POLY_TILING_STRATEGY_MANAGER_H_

 #include <iostream>
+#include <algorithm>

 #include "poly/tiling_analyzer.h"

@@ -199,6 +200,95 @@ class MulticoreStrategy {
  TileLogger &logger_;
  std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
 };
+
+/*
+ * Scores the tile axes of each band and derives their tiling priority from those scores.
+ */
+class TilingPriorityScorer {
+ public:
+  TilingPriorityScorer(TilingAnalyzer &analyzer)
+      : analyzer_(analyzer), logger_(TileLogger::GetInstance(analyzer.logger_.GetDumpDir())) {}
+  ~TilingPriorityScorer() = default;
+
+  /*
+   * Compute a total score of priority for each tile axis considering all related features and corresponding weights.
+   * Tile axis with higher score will have higher tiling priority (i.e. have more memory space).
+   * Note that the score of each feature is normalized into range [1, tile_axis_size].
+   */
+  void SetPriorityByScoring();
+
+  // Setters for the feature weights used by SetPriorityByScoring.
+  void SetParallelismWeight(const int parallelism) { weight_.parallelism = parallelism; }
+  void SetVectorizationWeight(const int vectorization) { weight_.vectorization = vectorization; }
+  // Despite its name, this sets the weight of the tile dependency feature.
+  void SetDataReuseWeight(const int tile_dependency) { weight_.tile_dependency = tile_dependency; }
+
+ private:
+  TilingAnalyzer &analyzer_;
+  TileLogger &logger_;
+
+  /*
+   * Weight parameters for each feature in priority score model.
+   * Initial weights are set empirically; changing them supports fine-tuning.
+   */
+  struct Weight {
+    int parallelism{1};  // gets the lowest weight because coincident may not always be trustworthy
+    int tile_dependency{2};
+    int vectorization{3};
+    // Sum of all weights; used as the divisor of the weighted average.
+    int Sum() const { return parallelism + vectorization + tile_dependency; }
+  } weight_;
+
+  /*
+   * Parallelism is computed by checking coincident value in schedule tree for corresponding axis.
+   * If an axis can be parallelised, the parallelism score is 0; otherwise it is 1.
+   */
+  std::vector<double> ComputeParallelism(std::vector<TileAxis *> tile_axes);
+
+  /*
+   * Tile dependency describes the relationship between tile axes: if more tile axes depend on one tile axis,
+   * this tile axis will have a higher tile dependency score and get higher priority during tiling.
+   * For example, a reduce axis is usually depended on by other axes and thus it should be put into local buffer first.
+   */
+  std::vector<double> ComputeTileDependency(std::vector<TileAxis *> tile_axes);
+
+  /*
+   * Vectorization is computed by accumulating the dimension index of corresponding axis on each buffer.
+   * If an axis is related with more innermost dimensions of different buffers, the vectorization score is higher.
+   */
+  std::vector<double> ComputeVectorization(std::vector<TileAxis *> tile_axes);
+
+  /*
+   * Min-max normalize `data`.
+   *  - Empty input yields an empty result.
+   *  - If all values are equal, every scaled value is 1.
+   *  - Otherwise values are mapped linearly to [1, range_max] when range_max > 1, and to [0, 1] when it is not.
+   * `range_max` is usually set to the size of tile axes that need to determine priority.
+   */
+  std::vector<double> MinMaxScaler(std::vector<double> data, int range_max = 1) {
+    if (data.empty()) {
+      // std::min_element / std::max_element return end() for an empty range, which must not be dereferenced.
+      return {};
+    }
+    const auto min = *std::min_element(data.begin(), data.end());
+    const auto max = *std::max_element(data.begin(), data.end());
+    std::stringstream ss;
+    ss << "Min: " << min << ", Max: " << max;
+    logger_.AppendLog(DO_TILING, ss);
+    std::vector<double> scaled_data(data.size(), 1);
+    const auto span = max - min;
+    if (span == 0) {
+      // All values are identical: keep the default of 1 and avoid dividing by zero.
+      return scaled_data;
+    }
+    for (int i = 0; i < static_cast<int>(data.size()); ++i) {
+      const auto old_d = data[i];
+      ss << "Orginal data: " << old_d;
+      auto new_d = (old_d - min) / span;
+      if (range_max > 1) {
+        // Stretch [0, 1] to [1, range_max].
+        new_d = new_d * (range_max - 1) + 1;
+      }
+      ss << " -> Scaled data: " << new_d;
+      scaled_data[i] = new_d;
+      logger_.AppendLog(DO_TILING, ss);
+    }
+    return scaled_data;
+  }
+
+  /*
+   * Collect every tile axis visited by ForEachAxisTopDown whose `index` equals `band_idx`.
+   */
+  std::vector<TileAxis *> GetBandTileAxes(int band_idx) {
+    std::vector<TileAxis *> tile_axes;
+    auto Collect = [&tile_axes, band_idx](TileAxis *axis) {
+      if (axis->index == band_idx) {
+        tile_axes.emplace_back(axis);
+      }
+    };
+    analyzer_.ForEachAxisTopDown(Collect);
+    return tile_axes;
+  }
+};
 }  // namespace poly
 }  // namespace ir
 }  // namespace akg