提交 da939f82 编写于 作者: D dabaiji

improve MulticoreStrategy in auto tiling and add related comments

上级 843b6771
...@@ -608,26 +608,30 @@ void TileCandidate::DoMemInfer() { ...@@ -608,26 +608,30 @@ void TileCandidate::DoMemInfer() {
} }
} }
int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) { /*
// e.g.1 * This function returns current data size moved from local buffer (UB in Davinci)
// Input ir: * to main memory (GM in Davinci) within target axis.
// for (cc0) <--- axis, dtype = float16 * e.g.1: target is not inner-most axis
// for (cc1) <--- tile factor 1024, dtype = float16 * Input ir:
// GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1] * for (cc0) <--- axis, dtype = float16
// for (cc0) <--- axis * for (cc1) <--- tile factor 1024, dtype = float16
// for (cc2) <--- tile factor 1024, dtype = float32 * GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
// GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2] * for (cc0) <--- axis
// Return: * for (cc2) <--- tile factor 1024, dtype = float32
// 1024 * 2 * GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
// e.g.2 * Return:
// Input ir: * min(1024 * 2(fp16), 1024 * 4(fp32)) = 1024 * 2
// for (cc0) <--- axis, dtype = float16 *
// GM_BUF1[cc0] = UB_BUF1[cc0] * e.g.2: target is inner-most axis
// Return: * Input ir:
// 1 * 2 * for (cc0) <--- axis, dtype = float16
int min_data_each_core = -1; * GM_BUF1[cc0] = UB_BUF1[cc0]
* Return:
* 32(ALIGN_BYTES) / 2(fp16) = 16
*/
int TileCandidate::GetDmaCopySizeWithinAxis(TileAxis *target_axis) {
std::stringstream ss; std::stringstream ss;
int min_data_each_core = -1;
bool before_this_axis = true; bool before_this_axis = true;
for (const auto &attr : analyzer_->RootAxis()->attrs) { for (const auto &attr : analyzer_->RootAxis()->attrs) {
if (attr.attr_key.find("DMA3") == std::string::npos) { if (attr.attr_key.find("DMA3") == std::string::npos) {
...@@ -635,7 +639,7 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) { ...@@ -635,7 +639,7 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
} }
int64_t data_each_core = 1; int64_t data_each_core = 1;
int data_bytes = -1; int data_bytes = -1;
bool record = true; bool need_record = true;
std::string gm_buf_name = attr.attr_value; std::string gm_buf_name = attr.attr_value;
auto it = analyzer_->buf_info_.find(gm_buf_name); auto it = analyzer_->buf_info_.find(gm_buf_name);
if (it == analyzer_->buf_info_.end()) { if (it == analyzer_->buf_info_.end()) {
...@@ -643,32 +647,28 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) { ...@@ -643,32 +647,28 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
} }
auto gm_buf = it->second.get(); auto gm_buf = it->second.get();
for (auto &gm_axis : *(gm_buf->tile_axis)) { for (auto &gm_axis : *(gm_buf->tile_axis)) {
if (gm_axis->index != axis->index) { if (gm_axis->index != target_axis->index || gm_axis->range_extent.as<IntImm>() == nullptr) {
record = false; need_record = false;
break; break;
} }
if (gm_axis == axis) { if (gm_axis == target_axis) {
before_this_axis = false; before_this_axis = false;
continue; continue;
} }
if (before_this_axis) { if (before_this_axis) {
continue; continue;
} }
if (gm_axis->range_extent.as<IntImm>() == nullptr) {
record = false;
break;
}
int64_t l1_val = MIN_TILE; int64_t l1_val = MIN_TILE;
std::tie(l1_val, std::ignore) = GetConstTileVal(gm_axis); std::tie(l1_val, std::ignore) = GetConstTileVal(gm_axis);
if (l1_val == TileVarId::VAR) { if (l1_val == TileVarId::VAR) {
record = false; need_record = false;
break; break;
} }
CHECK_NE(l1_val, 0) << "Inner axis " << gm_axis->dim_axis << " should be tile before axis " << axis->dim_axis; CHECK_NE(l1_val, 0) << "Inner axis " << gm_axis->dim_axis << " should be tile before axis "
<< target_axis->dim_axis;
if (gm_axis->HasAnyAttr({"REDUCE_AXIS", "TRANSPOSE", "TRANSFORM"})) { if (gm_axis->HasAnyAttr({"REDUCE_AXIS", "TRANSPOSE", "TRANSFORM"})) {
ss << "axis " << gm_axis->index << "_" << gm_axis->dim_axis << " cannot be flatten. clear data each core."; ss << "axis " << gm_axis->index << "_" << gm_axis->dim_axis << " cannot be flatten. clear data each core.";
analyzer_->logger_.AppendLog(DO_TILING, ss); analyzer_->logger_.AppendLog(DO_TILING, ss);
data_each_core = 1; data_each_core = 1;
data_bytes = 1; data_bytes = 1;
continue; continue;
...@@ -678,19 +678,51 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) { ...@@ -678,19 +678,51 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
auto min_bytes = static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(gm_axis->data_size)); auto min_bytes = static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(gm_axis->data_size));
data_bytes = (data_bytes == -1 || min_bytes < data_bytes) ? min_bytes : data_bytes; data_bytes = (data_bytes == -1 || min_bytes < data_bytes) ? min_bytes : data_bytes;
} }
if (record && (min_data_each_core == -1 || data_bytes * data_each_core < min_data_each_core)) if (need_record && (min_data_each_core == -1 || data_bytes * data_each_core < min_data_each_core)) {
min_data_each_core = data_bytes * data_each_core; min_data_each_core = data_bytes * data_each_core;
} }
ss << "[Data within axis " << axis->index << "_" << axis->dim_axis << "]: " << min_data_each_core; }
ss << "[Data within axis " << target_axis->index << "_" << target_axis->dim_axis << "]: " << min_data_each_core;
analyzer_->logger_.AppendLog(DO_TILING, ss); analyzer_->logger_.AppendLog(DO_TILING, ss);
return min_data_each_core == -1 ? static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(axis->data_size)) return min_data_each_core == -1 ? static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(target_axis->data_size))
: min_data_each_core; : min_data_each_core;
} }
/*
* This function returns the minimal tile size of the axis that keeps multi-core execution enabled.
* If the inner-most data granularity of a DMA copy from local buffer to main memory is smaller than
* the alignment size (32 bytes on a Davinci core), multi-core execution is disabled.
*/
int TileCandidate::GetMinFactorToEnableMulticore(TileAxis *axis) { int TileCandidate::GetMinFactorToEnableMulticore(TileAxis *axis) {
return std::max(static_cast<int>(ALIGN_BYTES / GetMinUbToGmDataAfterAxis(axis)), 1); return std::max(static_cast<int>(ALIGN_BYTES / GetDmaCopySizeWithinAxis(axis)), 1);
}
/*
 * Returns the minimal tile size of `axis` for which every core still gets enough data to process.
 *
 * The granularity already provided by the other axes is the product of their constant tile
 * factors. Axes with a non-constant range extent, or with an undefined or variable tile factor,
 * do not contribute. The result is MIN_MULTICORE_BYTES divided by that product, and never less
 * than 1. Candidate tile sizes below the returned value are regarded as multi-core inefficient.
 *
 * NOTE(review): the product is built from tile factors only; no data-type width is applied here
 * before it is compared with MIN_MULTICORE_BYTES -- confirm this is intended.
 */
int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
  const int64_t min_granularity = static_cast<int64_t>(MIN_MULTICORE_BYTES);
  // Accumulate in 64 bits: tile factors are int64_t, and an `int` accumulator would narrow silently.
  int64_t granularity = 1;
  for (auto a : this->tile_axis_) {
    if (a == axis) {
      continue;
    }
    if (!a->range_extent.as<IntImm>()) {
      continue;
    }
    const int64_t l1_val = this->GetConstTileVal(a).first;
    if (l1_val == TileVarId::UNDEFINE || l1_val == TileVarId::VAR) {
      continue;
    }
    // A zero (or otherwise non-positive) factor carries no usable size information and would make
    // the division below fail, so it is ignored.
    if (l1_val <= 0) {
      continue;
    }
    // Once the other axes already supply the minimal granularity, the quotient can no longer
    // exceed 1; returning here also keeps the product from overflowing.
    if (l1_val >= min_granularity) {
      return 1;
    }
    granularity *= l1_val;
    if (granularity >= min_granularity) {
      return 1;
    }
  }
  return std::max(static_cast<int>(min_granularity / granularity), 1);
}
/*
* This function returns the product of the loop extents of all pending (not yet tiled) axes.
*/
int TileCandidate::GetMaximalPendingBlocks(TileAxis *excluded_axis) { int TileCandidate::GetMaximalPendingBlocks(TileAxis *excluded_axis) {
int64_t blocks = 1; int64_t blocks = 1;
for (auto axis : this->tile_axis_) { for (auto axis : this->tile_axis_) {
......
...@@ -380,7 +380,8 @@ class TileCandidate { ...@@ -380,7 +380,8 @@ class TileCandidate {
static int GetCoreNumConf(); static int GetCoreNumConf();
int GetMinFactorToEnableMulticore(TileAxis *axis); int GetMinFactorToEnableMulticore(TileAxis *axis);
int GetMaximalPendingBlocks(TileAxis *excluded_axis); int GetMaximalPendingBlocks(TileAxis *excluded_axis);
int GetMinUbToGmDataAfterAxis(TileAxis *axis); int GetDmaCopySizeWithinAxis(TileAxis *axis);
int GetMinFactorForMinDataGranularity(TileAxis *axis);
private: private:
void DoMemInfer(); void DoMemInfer();
......
...@@ -433,14 +433,13 @@ void GemmStrategy::AddConstraint() { ...@@ -433,14 +433,13 @@ void GemmStrategy::AddConstraint() {
std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) { std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
int max_core = cand_.GetCoreNumConf(); int max_core = cand_.GetCoreNumConf();
int used_core = 1; int used_core = 1;
std::pair<int, int> proposal_range = std::make_pair( std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1);
std::max(static_cast<int>(MIN_MULTICORE_BYTES / cand_.GetMinUbToGmDataAfterAxis(multicore_axis)), 1), -1);
auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1); auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1);
std::stringstream ss; std::stringstream ss;
if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range; if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
auto shape = multicore_axis->range_extent.as<IntImm>()->value; auto shape = multicore_axis->range_extent.as<IntImm>()->value;
bool is_last_level = false; bool is_last_level = false;
for (auto other_axis : cand_.GetTileAxis()) { for (auto other_axis : this->cand_.GetTileAxis()) {
if (other_axis == multicore_axis) break; if (other_axis == multicore_axis) break;
if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue; if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue;
if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range; if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
...@@ -480,6 +479,7 @@ std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis ...@@ -480,6 +479,7 @@ std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis
logger_.AppendLog(DO_TILING, ss); logger_.AppendLog(DO_TILING, ss);
return proposal_range; return proposal_range;
} }
int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *multicore_axis, int64_t tiling_factor) { int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *multicore_axis, int64_t tiling_factor) {
CHECK_GT(tiling_factor, 0) << "tiling factor cant be zero or negative"; CHECK_GT(tiling_factor, 0) << "tiling factor cant be zero or negative";
auto proposal_range = GetProposalRangeForFullMulticore(multicore_axis); auto proposal_range = GetProposalRangeForFullMulticore(multicore_axis);
...@@ -488,12 +488,19 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis * ...@@ -488,12 +488,19 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
auto origin_factor = tiling_factor; auto origin_factor = tiling_factor;
std::stringstream ss; std::stringstream ss;
if ((!multicore_axis->mc_sup) || (multicore_axis->HasAttr("REDUCE_AXIS")) || if ((!multicore_axis->mc_sup) || (multicore_axis->HasAttr("REDUCE_AXIS") || (max_factor_for_full_cores <= 0))) {
(tiling_factor < cand_.GetMinFactorToEnableMulticore(multicore_axis) ||
(tiling_factor == max_factor_for_full_cores) || (max_factor_for_full_cores <= 0))) {
logger_.AppendLine(DO_TILING, "This axis is not suitable for multicore, return."); logger_.AppendLine(DO_TILING, "This axis is not suitable for multicore, return.");
return origin_factor; return origin_factor;
} }
if (tiling_factor < cand_.GetMinFactorToEnableMulticore(multicore_axis)) {
logger_.AppendLine(DO_TILING, "Inner-most tile size is smaller than 32 bytes, multicore is disable, return.");
return origin_factor;
}
if ((tiling_factor <= min_factor_for_enough_data) ||
(min_factor_for_enough_data >= cand_.GetCoreNumConf() * max_factor_for_full_cores)) {
logger_.AppendLine(DO_TILING, "Cannot increase degree of parallelism by adjusting current tiling factor, return.");
return origin_factor;
}
auto CheckConstConstraint = [this, &ss](Expr constraint) { auto CheckConstConstraint = [this, &ss](Expr constraint) {
if (constraint.as<IntImm>() == nullptr) { if (constraint.as<IntImm>() == nullptr) {
...@@ -505,18 +512,27 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis * ...@@ -505,18 +512,27 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
CheckConstConstraint(multicore_axis->l1_constraints.tile_min_); CheckConstConstraint(multicore_axis->l1_constraints.tile_min_);
CheckConstConstraint(multicore_axis->l1_constraints.tile_mod_); CheckConstConstraint(multicore_axis->l1_constraints.tile_mod_);
auto pending_blocks = cand_.GetMaximalPendingBlocks(multicore_axis);
if (tiling_factor < max_factor_for_full_cores) { if (tiling_factor < max_factor_for_full_cores) {
auto end = static_cast<int>(sqrt(max_factor_for_full_cores)); auto end = static_cast<int>(sqrt(max_factor_for_full_cores));
while (max_factor_for_full_cores % tiling_factor != 0 && tiling_factor > end) --tiling_factor; while (max_factor_for_full_cores % tiling_factor != 0 && tiling_factor > end) {
} else { --tiling_factor;
}
} else if (max_factor_for_full_cores >= min_factor_for_enough_data) {
tiling_factor = max_factor_for_full_cores; tiling_factor = max_factor_for_full_cores;
} else if (max_factor_for_full_cores < min_factor_for_enough_data) {
// In this case, simply adjusting the tiling factor to max_factor_for_full_cores may leave insufficient data
// in each core, while adjusting it to min_factor_for_enough_data may reduce the number of parallel cores.
// Since pending blocks can compensate for the data in each core, we decide based on their count.
tiling_factor = pending_blocks >= static_cast<int>(min_factor_for_enough_data / max_factor_for_full_cores)
? max_factor_for_full_cores
: min_factor_for_enough_data;
} }
auto shape = multicore_axis->range_extent.as<IntImm>()->value; auto shape = multicore_axis->range_extent.as<IntImm>()->value;
bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0); bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0);
auto multicore_shrink_limit = 2; auto multicore_shrink_limit = 2;
auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor); auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor);
auto pending_blocks = cand_.GetMaximalPendingBlocks(multicore_axis);
if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) { if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) {
ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;" ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;"
<< " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient."; << " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient.";
......
...@@ -192,12 +192,12 @@ class MulticoreStrategy { ...@@ -192,12 +192,12 @@ class MulticoreStrategy {
MulticoreStrategy(TileCandidate &cand, const std::string log_file) MulticoreStrategy(TileCandidate &cand, const std::string log_file)
: cand_(cand), logger_(TileLogger::GetInstance(log_file)) {} : cand_(cand), logger_(TileLogger::GetInstance(log_file)) {}
~MulticoreStrategy() {} ~MulticoreStrategy() {}
std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
int64_t AdjustTilingAccordingToMulticoreConstraint(TileAxis *axis, int64_t tiling_factor); int64_t AdjustTilingAccordingToMulticoreConstraint(TileAxis *axis, int64_t tiling_factor);
private: private:
TileCandidate &cand_; TileCandidate &cand_;
TileLogger &logger_; TileLogger &logger_;
std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
}; };
} // namespace poly } // namespace poly
} // namespace ir } // namespace ir
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册