提交 d920ab32 编写于 作者: D dabaiji

add core number and core granularity balance model

上级 225139a0
...@@ -717,7 +717,7 @@ int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) { ...@@ -717,7 +717,7 @@ int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
} }
granularity *= l1_val; granularity *= l1_val;
} }
return std::max(static_cast<int>(MIN_MULTICORE_BYTES / granularity), 1); return std::max(static_cast<int>(MIN_CORE_GRANULARITY / granularity), 1);
} }
/* /*
......
...@@ -52,7 +52,9 @@ constexpr auto DUMP_LEVEL_TUNING = 3; ...@@ -52,7 +52,9 @@ constexpr auto DUMP_LEVEL_TUNING = 3;
constexpr auto DUMP_LINE_BREAK_NUM = 100; constexpr auto DUMP_LINE_BREAK_NUM = 100;
constexpr auto GEN_PRIME_NUM = 32; constexpr auto GEN_PRIME_NUM = 32;
constexpr auto VECTORIZE_BYTE = 256; constexpr auto VECTORIZE_BYTE = 256;
constexpr auto MIN_MULTICORE_BYTES = 256; constexpr auto MAX_REPEAT = 255;
constexpr auto MIN_CORE_GRANULARITY = 256;
constexpr auto DESIRE_CORE_GRANULARITY = 8192; // 256 Bytes * 64 repeat
// Controlled by custom tiling. // Controlled by custom tiling.
constexpr auto ALLOCATION_PERCENTAGE = 0.5; // reserved for double buffer in default constexpr auto ALLOCATION_PERCENTAGE = 0.5; // reserved for double buffer in default
......
...@@ -426,19 +426,47 @@ void GemmStrategy::AddConstraint() { ...@@ -426,19 +426,47 @@ void GemmStrategy::AddConstraint() {
} }
} }
std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) { // Adjust max core for element-wise and inner-most reduction operations to balance core number and granularity.
// Proposes how many cores multicore tiling should target.
//
// Returns:
//   - 0 if any tile axis has a non-constant extent (no proposal can be made);
//   - the configured core number if any axis is tagged "TRANSFORM" or "TRANSPOSE",
//     or is a "REDUCE_AXIS" that is not "REDUCE_SRC_LAST";
//   - otherwise the configured core number, lowered to
//     problem_size / DESIRE_CORE_GRANULARITY (rounded down to an even number when
//     above 2) if the whole problem is smaller than
//     max_core * MIN_CORE_GRANULARITY * MAX_REPEAT.
// The result may be 0 for very small problems; the caller is expected to clamp it.
int MulticoreStrategy::GetProposalCoreNum() {
  int max_core = cand_.GetCoreNumConf();
  // Accumulate in 64 bits: the product of all axis extents can exceed the range of
  // int, and a wrapped (possibly negative) size would wrongly take the
  // "small problem" branch below and collapse the proposed core number.
  int64_t problem_size = 1;
  for (auto axis : this->cand_.GetTileAxis()) {
    const auto *extent = axis->range_extent.as<IntImm>();
    if (extent == nullptr) {
      return 0;
    }
    if ((axis->HasAttr("TRANSFORM")) || (axis->HasAttr("TRANSPOSE")) ||
        (axis->HasAttr("REDUCE_AXIS") && !axis->HasAttr("REDUCE_SRC_LAST"))) {
      return max_core;
    }
    problem_size *= extent->value;
  }
  // Size above which every configured core is considered to have enough work.
  const int64_t full_core_size = static_cast<int64_t>(max_core) * MIN_CORE_GRANULARITY * MAX_REPEAT;
  if (problem_size < full_core_size) {
    // Trade core count for per-core granularity on small problems.
    max_core = static_cast<int>(problem_size / DESIRE_CORE_GRANULARITY);
    // Prefer an even number of cores once more than two are used.
    if (max_core > 2 && max_core % 2 != 0) {
      max_core--;
    }
  }
  return max_core;
}
std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
int max_core = GetProposalCoreNum();
int used_core = 1; int used_core = 1;
std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1); std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1);
auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1); auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1);
std::stringstream ss; std::stringstream ss;
if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range; if (multicore_axis->range_extent.as<IntImm>() == nullptr || this_level_core <= 1) {
return proposal_range;
}
auto shape = multicore_axis->range_extent.as<IntImm>()->value; auto shape = multicore_axis->range_extent.as<IntImm>()->value;
bool is_last_level = false; bool is_last_level = false;
for (auto other_axis : this->cand_.GetTileAxis()) { for (auto other_axis : this->cand_.GetTileAxis()) {
if (other_axis == multicore_axis) break; if (other_axis == multicore_axis) break;
if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue; if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue;
if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
int64_t l1_val = TileVarId::UNDEFINE; int64_t l1_val = TileVarId::UNDEFINE;
std::tie(l1_val, std::ignore) = cand_.GetConstTileVal(other_axis); std::tie(l1_val, std::ignore) = cand_.GetConstTileVal(other_axis);
if (l1_val == TileVarId::VAR) return proposal_range; if (l1_val == TileVarId::VAR) return proposal_range;
...@@ -529,7 +557,7 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis * ...@@ -529,7 +557,7 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0); bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0);
auto multicore_shrink_limit = 2; auto multicore_shrink_limit = 2;
auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor); auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor);
if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) { if ((static_cast<int>(origin_factor / tiling_factor) > multicore_shrink_limit) && reduced_mem > pending_blocks) {
ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;" ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;"
<< " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient."; << " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient.";
logger_.AppendLog(DO_TILING, ss); logger_.AppendLog(DO_TILING, ss);
......
...@@ -199,6 +199,7 @@ class MulticoreStrategy { ...@@ -199,6 +199,7 @@ class MulticoreStrategy {
TileCandidate &cand_; TileCandidate &cand_;
TileLogger &logger_; TileLogger &logger_;
std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis); std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
int GetProposalCoreNum();
}; };
class TilingPriorityScorer { class TilingPriorityScorer {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册