Commit 5a35fac5 authored by mindspore-ci-bot, committed by Gitee

!19 improve MulticoreStrategy in auto tiling and add related comments

Merge pull request !19 from yangsijia/fix-issue-I1L074
@@ -608,26 +608,30 @@ void TileCandidate::DoMemInfer() {
}
}
int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
// e.g.1
// Input ir:
// for (cc0) <--- axis, dtype = float16
// for (cc1) <--- tile factor 1024, dtype = float16
// GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
// for (cc0) <--- axis
// for (cc2) <--- tile factor 1024, dtype = float32
// GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
// Return:
// 1024 * 2
// e.g.2
// Input ir:
// for (cc0) <--- axis, dtype = float16
// GM_BUF1[cc0] = UB_BUF1[cc0]
// Return:
// 1 * 2
int min_data_each_core = -1;
/*
 * This function returns the minimal data size moved from the local buffer (UB in Davinci)
 * to main memory (GM in Davinci) among the DMA copies within the target axis.
* e.g.1: target is not inner-most axis
* Input ir:
* for (cc0) <--- axis, dtype = float16
* for (cc1) <--- tile factor 1024, dtype = float16
* GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
* for (cc0) <--- axis
* for (cc2) <--- tile factor 1024, dtype = float32
* GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
* Return:
* min(1024 * 2(fp16), 1024 * 4(fp32)) = 1024 * 2
*
* e.g.2: target is inner-most axis
* Input ir:
* for (cc0) <--- axis, dtype = float16
* GM_BUF1[cc0] = UB_BUF1[cc0]
* Return:
* 32(ALIGN_BYTES) / 2(fp16) = 16
*/
int TileCandidate::GetDmaCopySizeWithinAxis(TileAxis *target_axis) {
std::stringstream ss;
int min_data_each_core = -1;
bool before_this_axis = true;
for (const auto &attr : analyzer_->RootAxis()->attrs) {
if (attr.attr_key.find("DMA3") == std::string::npos) {
@@ -635,7 +639,7 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
}
int64_t data_each_core = 1;
int data_bytes = -1;
bool record = true;
bool need_record = true;
std::string gm_buf_name = attr.attr_value;
auto it = analyzer_->buf_info_.find(gm_buf_name);
if (it == analyzer_->buf_info_.end()) {
@@ -643,32 +647,28 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
}
auto gm_buf = it->second.get();
for (auto &gm_axis : *(gm_buf->tile_axis)) {
if (gm_axis->index != axis->index) {
record = false;
if (gm_axis->index != target_axis->index || gm_axis->range_extent.as<IntImm>() == nullptr) {
need_record = false;
break;
}
if (gm_axis == axis) {
if (gm_axis == target_axis) {
before_this_axis = false;
continue;
}
if (before_this_axis) {
continue;
}
if (gm_axis->range_extent.as<IntImm>() == nullptr) {
record = false;
break;
}
int64_t l1_val = MIN_TILE;
std::tie(l1_val, std::ignore) = GetConstTileVal(gm_axis);
if (l1_val == TileVarId::VAR) {
record = false;
need_record = false;
break;
}
CHECK_NE(l1_val, 0) << "Inner axis " << gm_axis->dim_axis << " should be tile before axis " << axis->dim_axis;
CHECK_NE(l1_val, 0) << "Inner axis " << gm_axis->dim_axis << " should be tile before axis "
<< target_axis->dim_axis;
if (gm_axis->HasAnyAttr({"REDUCE_AXIS", "TRANSPOSE", "TRANSFORM"})) {
ss << "axis " << gm_axis->index << "_" << gm_axis->dim_axis << " cannot be flatten. clear data each core.";
analyzer_->logger_.AppendLog(DO_TILING, ss);
data_each_core = 1;
data_bytes = 1;
continue;
@@ -678,19 +678,51 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
auto min_bytes = static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(gm_axis->data_size));
data_bytes = (data_bytes == -1 || min_bytes < data_bytes) ? min_bytes : data_bytes;
}
if (record && (min_data_each_core == -1 || data_bytes * data_each_core < min_data_each_core))
if (need_record && (min_data_each_core == -1 || data_bytes * data_each_core < min_data_each_core)) {
min_data_each_core = data_bytes * data_each_core;
}
}
ss << "[Data within axis " << axis->index << "_" << axis->dim_axis << "]: " << min_data_each_core;
ss << "[Data within axis " << target_axis->index << "_" << target_axis->dim_axis << "]: " << min_data_each_core;
analyzer_->logger_.AppendLog(DO_TILING, ss);
return min_data_each_core == -1 ? static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(axis->data_size))
return min_data_each_core == -1 ? static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(target_axis->data_size))
: min_data_each_core;
}
/*
* This function returns the minimal tile size of the axis that can enable the multi-core function.
* If the inner-most data granularity of a DMA copy from local buffer to main memory is less than
* the alignment bytes, which is 32 on a Davinci core, the multi-core function will be disabled.
*/
int TileCandidate::GetMinFactorToEnableMulticore(TileAxis *axis) {
return std::max(static_cast<int>(ALIGN_BYTES / GetMinUbToGmDataAfterAxis(axis)), 1);
return std::max(static_cast<int>(ALIGN_BYTES / GetDmaCopySizeWithinAxis(axis)), 1);
}
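// A worked example (a sketch based on the e.g. cases in GetDmaCopySizeWithinAxis above,
// assuming ALIGN_BYTES = 32):
//   e.g.2 (inner-most fp16 axis): GetDmaCopySizeWithinAxis returns 32 / 2 = 16,
//   so the minimal factor to enable multi-core is max(32 / 16, 1) = 2.
//   e.g.1: GetDmaCopySizeWithinAxis returns 1024 * 2 = 2048, which already exceeds the
//   alignment bytes, so the minimal factor is max(32 / 2048, 1) = 1 and multi-core is not limited.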
/*
* This function returns the minimal tile size of the axis such that each core has enough data granularity to process.
* The minimal data granularity for each core is set to 256 bytes by default; if the actual data granularity is less
* than this value, the candidate tile size will be regarded as multi-core inefficient.
*/
int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
auto granularity = 1;
for (auto a : this->tile_axis_) {
if (a == axis) {
continue;
}
if (!a->range_extent.as<IntImm>()) {
continue;
}
int64_t l1_val = this->GetConstTileVal(a).first;
if (l1_val == TileVarId::UNDEFINE || l1_val == TileVarId::VAR) {
continue;
}
granularity *= l1_val;
}
return std::max(static_cast<int>(MIN_MULTICORE_BYTES / granularity), 1);
}
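// A worked example (a sketch, assuming MIN_MULTICORE_BYTES = 256 as described above and
// illustrative tile values): if the other axes are already tiled with constant factors
// 4 and 16, granularity = 4 * 16 = 64 and this axis needs a factor of at least
// max(256 / 64, 1) = 4; once granularity reaches 256, the returned minimum drops to 1.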
/*
* This function returns the product of the loop extents of all pending (not tiled) axes.
*/
int TileCandidate::GetMaximalPendingBlocks(TileAxis *excluded_axis) {
int64_t blocks = 1;
for (auto axis : this->tile_axis_) {
......
@@ -380,7 +380,8 @@ class TileCandidate {
static int GetCoreNumConf();
int GetMinFactorToEnableMulticore(TileAxis *axis);
int GetMaximalPendingBlocks(TileAxis *excluded_axis);
int GetMinUbToGmDataAfterAxis(TileAxis *axis);
int GetDmaCopySizeWithinAxis(TileAxis *axis);
int GetMinFactorForMinDataGranularity(TileAxis *axis);
private:
void DoMemInfer();
......
@@ -433,14 +433,13 @@ void GemmStrategy::AddConstraint() {
std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
int max_core = cand_.GetCoreNumConf();
int used_core = 1;
std::pair<int, int> proposal_range = std::make_pair(
std::max(static_cast<int>(MIN_MULTICORE_BYTES / cand_.GetMinUbToGmDataAfterAxis(multicore_axis)), 1), -1);
std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1);
auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1);
std::stringstream ss;
if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
auto shape = multicore_axis->range_extent.as<IntImm>()->value;
bool is_last_level = false;
for (auto other_axis : cand_.GetTileAxis()) {
for (auto other_axis : this->cand_.GetTileAxis()) {
if (other_axis == multicore_axis) break;
if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue;
if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
@@ -480,6 +479,7 @@ std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis
logger_.AppendLog(DO_TILING, ss);
return proposal_range;
}
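// The returned pair is consumed below as (min_factor_for_enough_data, max_factor_for_full_cores).
// A hypothetical illustration: if GetCoreNumConf() returns 32 and the axis shape is 1024,
// a tile factor of 1024 / 32 = 32 keeps every core busy, so 32 would be the upper bound of the
// proposal range, while the lower bound comes from GetMinFactorForMinDataGranularity.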
int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *multicore_axis, int64_t tiling_factor) {
CHECK_GT(tiling_factor, 0) << "tiling factor cannot be zero or negative";
auto proposal_range = GetProposalRangeForFullMulticore(multicore_axis);
@@ -488,12 +488,19 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
auto origin_factor = tiling_factor;
std::stringstream ss;
if ((!multicore_axis->mc_sup) || (multicore_axis->HasAttr("REDUCE_AXIS")) ||
(tiling_factor < cand_.GetMinFactorToEnableMulticore(multicore_axis) ||
(tiling_factor == max_factor_for_full_cores) || (max_factor_for_full_cores <= 0))) {
if ((!multicore_axis->mc_sup) || (multicore_axis->HasAttr("REDUCE_AXIS") || (max_factor_for_full_cores <= 0))) {
logger_.AppendLine(DO_TILING, "This axis is not suitable for multicore, return.");
return origin_factor;
}
if (tiling_factor < cand_.GetMinFactorToEnableMulticore(multicore_axis)) {
logger_.AppendLine(DO_TILING, "Inner-most tile size is smaller than 32 bytes, multicore is disable, return.");
return origin_factor;
}
if ((tiling_factor <= min_factor_for_enough_data) ||
(min_factor_for_enough_data >= cand_.GetCoreNumConf() * max_factor_for_full_cores)) {
logger_.AppendLine(DO_TILING, "Cannot increase degree of parallelism by adjusting current tiling factor, return.");
return origin_factor;
}
auto CheckConstConstraint = [this, &ss](Expr constraint) {
if (constraint.as<IntImm>() == nullptr) {
@@ -505,18 +512,27 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
CheckConstConstraint(multicore_axis->l1_constraints.tile_min_);
CheckConstConstraint(multicore_axis->l1_constraints.tile_mod_);
auto pending_blocks = cand_.GetMaximalPendingBlocks(multicore_axis);
if (tiling_factor < max_factor_for_full_cores) {
auto end = static_cast<int>(sqrt(max_factor_for_full_cores));
while (max_factor_for_full_cores % tiling_factor != 0 && tiling_factor > end) --tiling_factor;
} else {
while (max_factor_for_full_cores % tiling_factor != 0 && tiling_factor > end) {
--tiling_factor;
}
} else if (max_factor_for_full_cores >= min_factor_for_enough_data) {
tiling_factor = max_factor_for_full_cores;
} else if (max_factor_for_full_cores < min_factor_for_enough_data) {
// In this case, simply adjusting the tiling factor to max_factor_for_full_cores may lead to insufficient data
// in each core, while adjusting it to min_factor_for_enough_data may lead to fewer parallel cores.
// Since pending blocks can compensate for the data in each core, we make the decision based on their value.
tiling_factor = pending_blocks >= static_cast<int>(min_factor_for_enough_data / max_factor_for_full_cores)
? max_factor_for_full_cores
: min_factor_for_enough_data;
}
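// A hypothetical walk-through of the branches above (numbers are illustrative only):
//   tiling_factor = 20, max_factor_for_full_cores = 48: 48 % 20 != 0, so the factor is
//   decremented until it divides 48, stopping at 16 (still above sqrt(48)).
//   tiling_factor = 96, max_factor_for_full_cores = 48 >= min_factor_for_enough_data = 8:
//   the factor shrinks to 48 so that all cores stay busy.
//   max_factor_for_full_cores = 4 < min_factor_for_enough_data = 8: with pending_blocks >= 2
//   the factor becomes 4 (full cores, pending blocks compensate); otherwise it becomes 8.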
auto shape = multicore_axis->range_extent.as<IntImm>()->value;
bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0);
auto multicore_shrink_limit = 2;
auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor);
auto pending_blocks = cand_.GetMaximalPendingBlocks(multicore_axis);
if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) {
ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;"
<< " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient.";
......
@@ -192,12 +192,12 @@ class MulticoreStrategy {
MulticoreStrategy(TileCandidate &cand, const std::string log_file)
: cand_(cand), logger_(TileLogger::GetInstance(log_file)) {}
~MulticoreStrategy() {}
std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
int64_t AdjustTilingAccordingToMulticoreConstraint(TileAxis *axis, int64_t tiling_factor);
private:
TileCandidate &cand_;
TileLogger &logger_;
std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
};
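// A minimal usage sketch (hypothetical call site; the file name is illustrative): the tiling
// pass could construct the strategy from an existing candidate and then clamp a chosen factor:
//   MulticoreStrategy mc_strategy(cand, "multicore_tiling.log");
//   factor = mc_strategy.AdjustTilingAccordingToMulticoreConstraint(axis, factor);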
} // namespace poly
} // namespace ir
......