提交 44ef36bd 编写于 作者: D dabaiji

add tiling priority scoring model

上级 5a35fac5
......@@ -1303,12 +1303,7 @@ int TilingAnalyzer::GetNumOfAxisInBand(int band_idx) const {
return max + 1;
}
void TilingAnalyzer::TileSpaceAnalyze() {
CHECK(scop_);
SpaceAnalyzer space_analyzer(this);
space_analyzer.AnalyzeSpecialAxes();
void TilingAnalyzer::AddTilingConstraints() {
std::vector<TilingStrategy *> actived_strategies;
PassDownAttrStrategy pd_attr_strategy(this);
......@@ -1351,15 +1346,10 @@ void TilingAnalyzer::TileSpaceAnalyze() {
TilingStrategyManager &strategy_manager = TilingStrategyManager::GetInstance();
strategy_manager.SetStrategies(actived_strategies);
strategy_manager.Execute();
logger_.AppendLine(ANA_TILING_SPACE, "After adding constraints =======>");
auto PrintAttr = [&](TileAxis *a) -> void {
if (a != nullptr) a->DumpAxis();
};
ForEachAxisTopDown(PrintAttr);
logger_.AppendLine(ANA_TILING_SPACE, "<=============");
}
bool TilingAnalyzer::Prepare() {
// Stage 1: Analyze schedule tree.
ScheduleTreeAnalyzer sch_ana(this, this->sch_);
root_axis_ = sch_ana.Build(this->Halide());
if (root_axis_ == nullptr) {
......@@ -1368,25 +1358,42 @@ bool TilingAnalyzer::Prepare() {
if (root_axis_->children.empty()) {
return false;
}
auto build_axis_map = [this](const TileAxis *a) {
auto BuildAxisMap = [this](const TileAxis *a) {
for (auto loop : a->loops) {
CHECK(loop) << "Tile axis has null ptr loop, check";
this->tile_axis_[loop] = const_cast<TileAxis *>(a);
}
};
this->ForEachAxisTopDown(build_axis_map);
this->ForEachAxisTopDown(BuildAxisMap);
if (op_type_ != VECTOR_OP) {
sch_ana.AnalyzeCubeInfo();
}
TileSpaceAnalyze();
// Stage 2: Analyze Halide IR and add tiling constraints.
SpaceAnalyzer space_analyzer(this);
space_analyzer.AnalyzeSpecialAxes();
AddTilingConstraints();
// Stage 3: Analyze buffer footprint.
LinearAccessPatternBuilder lap_bdr(this);
lap_bdr.Build(body_);
linear_seq_ = std::move(lap_bdr.seq_);
buf_info_ = std::move(lap_bdr.buf_);
buffer_usage_timetable_ = std::move(lap_bdr.buffer_usage_timetable_);
// Stage 4: Set tiling priority based on previous analysis.
TilingPriorityScorer scroer(*this);
scroer.SetPriorityByScoring();
// Logging
logger_.AppendLine(ANA_TILING_SPACE, "After adding constraints =======>");
auto PrintAttr = [&](TileAxis *a) -> void {
if (a != nullptr) a->DumpAxis();
};
ForEachAxisTopDown(PrintAttr);
logger_.AppendLine(ANA_TILING_SPACE, "<=============");
DumpLinearSeq();
return true;
}
......
......@@ -288,7 +288,7 @@ class TilingAnalyzer {
std::unordered_map<std::string, std::shared_ptr<BufferEntry>> buf_info_;
private:
void TileSpaceAnalyze();
void AddTilingConstraints();
std::unique_ptr<TileAxis> root_axis_;
};
......
......@@ -42,7 +42,6 @@ void TilingSolver::CollectTileAxisTopDown() {
}
this->cand_.InsertAxisBack(a);
};
this->cand_.ResetTileAxis();
this->analyzer_.ForEachAxisTopDown(CollectTileAxis);
this->cand_.SortByPriority();
......@@ -97,9 +96,11 @@ TileCandidate *InequalitySolver::Solve() {
auto tile_band_size = static_cast<int>(analyzer_.RootAxis()->children.size());
for (auto band = 0; band < tile_band_size; ++band) {
tiling_band_ = band;
CollectTileAxisTopDown();
InitTileAxis(LEVEL1);
if (analyzer_.op_type_ != VECTOR_OP) {
InitTileAxis(LEVEL0);
}
......@@ -738,9 +739,11 @@ void DynamicShapeSolver::AppendTileConstraintInIR(TileCandidate *cand, TileLevel
TileCandidate *TraverseSolver::Solve() {
CollectMemoryLimit();
auto tile_band_size = static_cast<int>(analyzer_.RootAxis()->children.size());
for (auto band = 0; band < tile_band_size; ++band) {
tiling_band_ = band;
CollectTileAxisTopDown();
// tile all axis top down
......
......@@ -14,7 +14,7 @@
* limitations under the License.
*/
#include "poly/tiling_strategy_manager.h"
#include <numeric>
#include <iostream>
namespace akg {
......@@ -222,13 +222,9 @@ void ReduceStrategy::AddConstraint() {
if (align_elem == block_size) {
axis->l1_constraints.tile_min_ = align_elem;
} else {
axis->priority += 1;
axis->forbid_iso = true;
}
}
for (auto axis : analyzer_->GetAxesOfAttr("REDUCE_SRC_LAST")) {
axis->priority += 1;
}
}
void VectorizedStrategy::AddConstraint() {
......@@ -553,6 +549,104 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
return (valid && efficient) ? tiling_factor : origin_factor;
}
void TilingPriorityScorer::SetPriorityByScoring() {
std::stringstream ss;
for (int band_idx = 0; band_idx < static_cast<int>(analyzer_.RootAxis()->children.size()); ++band_idx) {
std::map<double, std::vector<TileAxis *>> priority_map;
std::vector<TileAxis *> tile_axes = GetBandTileAxes(band_idx);
auto norm_range = static_cast<int>(tile_axes.size());
auto dd_scores = MinMaxScaler(ComputeTileDependency(tile_axes), norm_range);
auto pl_scores = MinMaxScaler(ComputeParallelism(tile_axes), norm_range);
auto vec_scores = MinMaxScaler(ComputeVectorization(tile_axes), norm_range);
bool has_custom_priority = false;
int default_priority = -1;
for (int i = 0; i < static_cast<int>(tile_axes.size()); ++i) {
auto axis = tile_axes[i];
if (axis->priority != default_priority) {
has_custom_priority = true;
break;
}
ss << "Axis " << axis->index << " , " << axis->dim_axis << ": ";
auto total_score = (weight_.tile_dependency * dd_scores[i] + weight_.parallelism * pl_scores[i] +
weight_.vectorization * vec_scores[i]) /
weight_.Sum();
ss << "score = (tile dependency) " << weight_.tile_dependency << "*" << dd_scores[i] << " + (parallelism) "
<< weight_.parallelism << " * " << pl_scores[i] << " + (vectorization) " << weight_.vectorization << " * "
<< vec_scores[i] << " / " << weight_.Sum() << " = " << total_score;
logger_.AppendLog(DO_TILING, ss);
if (priority_map.find(total_score) == priority_map.end()) {
priority_map[total_score] = {axis};
} else {
priority_map[total_score].emplace_back(axis);
}
}
if (has_custom_priority) {
continue;
}
int priority = static_cast<int>(tile_axes.size()) - 1;
for (auto it : priority_map) {
for (auto a : it.second) {
a->priority = priority;
priority -= 1;
}
}
}
}
/*
 * Raw tile-dependency score of each axis: the product of `dim_axis + 1` and the result of
 * `HasAttr("REDUCE_AXIS")` (so the score is 0 whenever that call yields false).
 * The returned vector has one entry per element of `tile_axes`, in the same order.
 */
std::vector<double> TilingPriorityScorer::ComputeTileDependency(std::vector<TileAxis *> tile_axes) {
  std::vector<double> dependency_scores;
  dependency_scores.reserve(tile_axes.size());
  for (size_t idx = 0; idx < tile_axes.size(); ++idx) {
    TileAxis *current = tile_axes[idx];
    // The attribute check acts as a multiplier on the 1-based dimension position.
    const auto depth = current->dim_axis + 1;
    const auto reduce_flag = current->HasAttr("REDUCE_AXIS");
    dependency_scores.emplace_back(depth * reduce_flag);
  }
  return dependency_scores;
}
/*
 * Raw parallelism score of each axis: 1 when the axis' `mc_sup` member evaluates to false, 0 when it
 * evaluates to true. The returned vector has one entry per element of `tile_axes`, in the same order.
 */
std::vector<double> TilingPriorityScorer::ComputeParallelism(std::vector<TileAxis *> tile_axes) {
  std::vector<double> parallel_scores;
  parallel_scores.reserve(tile_axes.size());
  for (const TileAxis *candidate : tile_axes) {
    const bool flag_unset = !candidate->mc_sup;
    parallel_scores.push_back(flag_unset ? 1.0 : 0.0);
  }
  return parallel_scores;
}
/*
 * Compute the raw vectorization score of each tile axis.
 *
 * For every buffer recorded in `analyzer_.buf_info_`, the axis is searched in the buffer's tile-axis list. When it
 * is found at 1-based position `dim_depth`, `coef * dim_depth` is added to the axis score, where `coef` is looked up
 * from the buffer's memory scope (GM: 2, UB: 1, L1/L0A/L0B/L0C: 0; a scope missing from the table counts as 0).
 *
 * Returns one score per element of `tile_axes`, in the same order.
 */
std::vector<double> TilingPriorityScorer::ComputeVectorization(std::vector<TileAxis *> tile_axes) {
  // The table never changes, so it is built once instead of on every call.
  static const std::unordered_map<DavinciMemScope, int> coef_map = {
    {DavinciMemScope::MEM_SCOPE_GM, 2},   // continuous dma copy is considered as the most important factor
    {DavinciMemScope::MEM_SCOPE_UB, 1},   // vectorization instruction is also important
    {DavinciMemScope::MEM_SCOPE_L1, 0},   // does not consider impact of L1 dma copy
    {DavinciMemScope::MEM_SCOPE_L0A, 0},  // does not consider impact of L0 dma copy
    {DavinciMemScope::MEM_SCOPE_L0B, 0},
    {DavinciMemScope::MEM_SCOPE_L0C, 0},
  };

  std::vector<double> scores;
  scores.reserve(tile_axes.size());
  for (auto axis : tile_axes) {
    int vec_level = 0;
    // Iterate by reference: a by-value loop variable would copy the key string and the shared_ptr each time.
    for (const auto &entry : analyzer_.buf_info_) {
      const auto buf = entry.second.get();
      if (buf == nullptr || !buf->tile_axis) {
        // No buffer, or no axis list attached to it: nothing can contribute to the score.
        continue;
      }
      // find() instead of operator[]: a lookup must not insert into the table.
      const auto coef_it = coef_map.find(buf->scope);
      const int coef = (coef_it == coef_map.end()) ? 0 : coef_it->second;
      if (coef == 0) {
        // A zero coefficient adds nothing whatever the position, so the scan can be skipped.
        continue;
      }
      int dim_depth = 1;
      for (const auto &a : *(buf->tile_axis)) {
        if (a == axis) {
          vec_level += coef * dim_depth;
          break;  // only the first occurrence of the axis in this buffer is counted
        }
        dim_depth += 1;
      }
    }
    scores.emplace_back(vec_level);
  }
  return scores;
}
} // namespace poly
} // namespace ir
} // namespace akg
......@@ -17,6 +17,7 @@
#define POLY_TILING_STRATEGY_MANAGER_H_
#include <iostream>
#include <algorithm>
#include "poly/tiling_analyzer.h"
......@@ -199,6 +200,95 @@ class MulticoreStrategy {
TileLogger &logger_;
std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
};
/*
 * Scoring model that derives the tiling priority of each tile axis from three features
 * (tile dependency, parallelism, vectorization) combined with configurable integer weights.
 */
class TilingPriorityScorer {
 public:
  TilingPriorityScorer(TilingAnalyzer &analyzer)
      : analyzer_(analyzer), logger_(TileLogger::GetInstance(analyzer.logger_.GetDumpDir())) {}
  ~TilingPriorityScorer() = default;

  /*
   * Compute a total priority score for each tile axis, considering all related features and their weights.
   * Tile axis with higher score will have higher tiling priority (i.e. have more memory space).
   * Note that the score of each feature is standardised into range [1, tile_axis_size].
   */
  void SetPriorityByScoring();

  // Override the weight of the parallelism feature.
  void SetParallelismWeight(const int parallelism) { weight_.parallelism = parallelism; }
  // Override the weight of the vectorization feature.
  void SetVectorizationWeight(const int vectorization) { weight_.vectorization = vectorization; }
  // Override the weight of the tile-dependency feature (the setter keeps its "data reuse" name).
  void SetDataReuseWeight(const int tile_dependency) { weight_.tile_dependency = tile_dependency; }

 private:
  TilingAnalyzer &analyzer_;
  TileLogger &logger_;

  /*
   * Weight parameters for each feature in the priority score model.
   * Initial weights are set empirically; changing them allows micro-tuning.
   */
  struct Weight {
    int parallelism{1};  // gets the lowest weight because coincident may not always be trustworthy
    int tile_dependency{2};
    int vectorization{3};
    // Sum of the three weights, used as the denominator of the weighted average.
    int Sum() const { return parallelism + vectorization + tile_dependency; }
  } weight_;

  /*
   * Parallelism is computed by checking coincident value in schedule tree for corresponding axis.
   * If an axis can be parallelised, the parallelism score is 0; otherwise it is 1.
   */
  std::vector<double> ComputeParallelism(std::vector<TileAxis *> tile_axes);

  /*
   * Tile dependency describes the relationship between tile axes: if more tile axes depend on one tile axis,
   * this tile axis will have a higher tile dependency score and gets higher priority during tiling.
   * For example, a reduce axis is usually depended on by other axes and thus should be put into local buffer first.
   */
  std::vector<double> ComputeTileDependency(std::vector<TileAxis *> tile_axes);

  /*
   * Vectorization is computed by accumulating the dimension index of corresponding axis on each buffer.
   * If an axis is related with more innermost dimensions of different buffers, the vectorization score is higher.
   */
  std::vector<double> ComputeVectorization(std::vector<TileAxis *> tile_axes);

  /*
   * Min-max normalisation of `data`.
   *  - `range_max` > 1 : values are mapped linearly onto [1, range_max].
   *  - `range_max` <= 1: values are mapped linearly onto [0, 1].
   *  - all values equal: every entry of the result is 1.
   *  - empty input     : an empty vector is returned (nothing is logged).
   * `range_max` is usually set to the number of tile axes whose priority needs to be determined.
   */
  std::vector<double> MinMaxScaler(std::vector<double> data, int range_max = 1) {
    if (data.empty()) {
      // min_element/max_element return end() on an empty range; dereferencing it is undefined behaviour.
      return {};
    }
    const double min = *std::min_element(data.begin(), data.end());
    const double max = *std::max_element(data.begin(), data.end());
    std::stringstream ss;
    ss << "Min: " << min << ", Max: " << max;
    logger_.AppendLog(DO_TILING, ss);

    std::vector<double> scaled_data(data.size(), 1);
    if (max - min == 0) {
      // Constant input: there is no spread to scale, so every entry keeps the value 1.
      return scaled_data;
    }
    for (size_t i = 0; i < data.size(); ++i) {
      const double old_d = data[i];
      ss << "Orginal data: " << old_d;
      double new_d = (old_d - min) / (max - min);
      new_d = range_max > 1 ? (new_d * (range_max - 1) + 1) : new_d;
      ss << " -> Scaled data: " << new_d;
      scaled_data[i] = new_d;
      logger_.AppendLog(DO_TILING, ss);
    }
    return scaled_data;
  }

  /*
   * Collect, in the visiting order of ForEachAxisTopDown, every tile axis whose `index` equals `band_idx`.
   */
  std::vector<TileAxis *> GetBandTileAxes(int band_idx) {
    std::vector<TileAxis *> tile_axes;
    auto Collect = [&tile_axes, band_idx](TileAxis *axis) {
      // Null entries are ignored rather than dereferenced.
      if (axis != nullptr && axis->index == band_idx) {
        tile_axes.emplace_back(axis);
      }
    };
    analyzer_.ForEachAxisTopDown(Collect);
    return tile_axes;
  }
};
} // namespace poly
} // namespace ir
} // namespace akg
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册