diff --git a/src/poly/tiling_analyzer.cc b/src/poly/tiling_analyzer.cc
index 1eb5aad1a69a8db0606c1976ed93157f04bbc185..bbc2113b7f32a1b98fd8457bb1eaf9791994cbac 100644
--- a/src/poly/tiling_analyzer.cc
+++ b/src/poly/tiling_analyzer.cc
@@ -608,26 +608,30 @@ void TileCandidate::DoMemInfer() {
   }
 }
 
-int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
-  // e.g.1
-  // Input ir:
-  //   for (cc0)  <--- axis, dtype = float16
-  //     for (cc1) <--- tile factor 1024, dtype = float16
-  //       GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
-  //   for (cc0)  <--- axis
-  //     for (cc2) <--- tile factor 1024, dtype = float32
-  //       GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
-  // Return:
-  //   1024 * 2
-  // e.g.2
-  // Input ir:
-  //   for (cc0)  <--- axis, dtype = float16
-  //     GM_BUF1[cc0] = UB_BUF1[cc0]
-  // Return:
-  //   1 * 2
-  int min_data_each_core = -1;
-
+/*
+ * This function returns the size of data moved from the local buffer (UB in Davinci)
+ * to main memory (GM in Davinci) within the target axis.
+ * e.g.1: target is not the inner-most axis
+ * Input ir:
+ *   for (cc0)  <--- axis, dtype = float16
+ *     for (cc1) <--- tile factor 1024, dtype = float16
+ *       GM_BUF1[cc0, cc1] = UB_BUF1[cc0, cc1]
+ *   for (cc0)  <--- axis
+ *     for (cc2) <--- tile factor 1024, dtype = float32
+ *       GM_BUF2[cc0, cc2] = UB_BUF2[cc0, cc2]
+ * Return:
+ *   min(1024 * 2 (fp16), 1024 * 4 (fp32)) = 1024 * 2
+ *
+ * e.g.2: target is the inner-most axis
+ * Input ir:
+ *   for (cc0)  <--- axis, dtype = float16
+ *     GM_BUF1[cc0] = UB_BUF1[cc0]
+ * Return:
+ *   32 (ALIGN_BYTES) / 2 (fp16) = 16
+ */
+int TileCandidate::GetDmaCopySizeWithinAxis(TileAxis *target_axis) {
   std::stringstream ss;
+  int min_data_each_core = -1;
   bool before_this_axis = true;
   for (const auto &attr : analyzer_->RootAxis()->attrs) {
     if (attr.attr_key.find("DMA3") == std::string::npos) {
@@ -635,7 +639,7 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
     }
     int64_t data_each_core = 1;
     int data_bytes = -1;
-    bool record = true;
+    bool need_record = true;
     std::string gm_buf_name = attr.attr_value;
     auto it = analyzer_->buf_info_.find(gm_buf_name);
     if (it == analyzer_->buf_info_.end()) {
@@ -643,32 +647,28 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
     }
     auto gm_buf = it->second.get();
     for (auto &gm_axis : *(gm_buf->tile_axis)) {
-      if (gm_axis->index != axis->index) {
-        record = false;
+      if (gm_axis->index != target_axis->index || gm_axis->range_extent.as<IntImm>() == nullptr) {
+        need_record = false;
         break;
       }
-      if (gm_axis == axis) {
+      if (gm_axis == target_axis) {
         before_this_axis = false;
         continue;
       }
       if (before_this_axis) {
         continue;
       }
-      if (gm_axis->range_extent.as<IntImm>() == nullptr) {
-        record = false;
-        break;
-      }
 
       int64_t l1_val = MIN_TILE;
       std::tie(l1_val, std::ignore) = GetConstTileVal(gm_axis);
       if (l1_val == TileVarId::VAR) {
-        record = false;
+        need_record = false;
         break;
       }
-      CHECK_NE(l1_val, 0) << "Inner axis " << gm_axis->dim_axis << " should be tile before axis " << axis->dim_axis;
+      CHECK_NE(l1_val, 0) << "Inner axis " << gm_axis->dim_axis << " should be tiled before axis "
+                          << target_axis->dim_axis;
       if (gm_axis->HasAnyAttr({"REDUCE_AXIS", "TRANSPOSE", "TRANSFORM"})) {
         ss << "axis " << gm_axis->index << "_" << gm_axis->dim_axis << " cannot be flatten. clear data each core.";
         analyzer_->logger_.AppendLog(DO_TILING, ss);
-        data_each_core = 1;
         data_bytes = 1;
         continue;
@@ -678,19 +678,51 @@ int TileCandidate::GetMinUbToGmDataAfterAxis(TileAxis *axis) {
       auto min_bytes = static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(gm_axis->data_size));
       data_bytes = (data_bytes == -1 || min_bytes < data_bytes) ? min_bytes : data_bytes;
     }
-    if (record && (min_data_each_core == -1 || data_bytes * data_each_core < min_data_each_core))
+    if (need_record && (min_data_each_core == -1 || data_bytes * data_each_core < min_data_each_core)) {
       min_data_each_core = data_bytes * data_each_core;
+    }
   }
-  ss << "[Data within axis " << axis->index << "_" << axis->dim_axis << "]: " << min_data_each_core;
+  ss << "[Data within axis " << target_axis->index << "_" << target_axis->dim_axis << "]: " << min_data_each_core;
   analyzer_->logger_.AppendLog(DO_TILING, ss);
-  return min_data_each_core == -1 ? static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(axis->data_size))
+  return min_data_each_core == -1 ? static_cast<int>(ALIGN_BYTES / GetMaxAlignBytes(target_axis->data_size))
                                   : min_data_each_core;
 }
 
+/*
+ * This function returns the minimal tile size of the axis that enables multi-core execution.
+ * If the inner-most granularity of a DMA copy from the local buffer to main memory is smaller than
+ * the alignment size, which is 32 bytes on Davinci cores, multi-core execution is disabled.
+ */
 int TileCandidate::GetMinFactorToEnableMulticore(TileAxis *axis) {
-  return std::max(static_cast<int>(ALIGN_BYTES / GetMinUbToGmDataAfterAxis(axis)), 1);
+  return std::max(static_cast<int>(ALIGN_BYTES / GetDmaCopySizeWithinAxis(axis)), 1);
+}
+
+/*
+ * This function returns the minimal tile size of the axis such that each core has enough data granularity to process.
+ * The minimal data granularity for each core is 256 bytes by default; if the actual granularity is smaller than
+ * this value, the candidate tile size is regarded as inefficient for multi-core.
+ */
+int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
+  auto granularity = 1;
+  for (auto a : this->tile_axis_) {
+    if (a == axis) {
+      continue;
+    }
+    if (!a->range_extent.as<IntImm>()) {
+      continue;
+    }
+    int64_t l1_val = this->GetConstTileVal(a).first;
+    if (l1_val == TileVarId::UNDEFINE || l1_val == TileVarId::VAR) {
+      continue;
+    }
+    granularity *= l1_val;
+  }
+  return std::max(static_cast<int>(MIN_MULTICORE_BYTES / granularity), 1);
 }
 
+/*
+ * This function returns the product of the loop extents of all pending (not tiled) axes.
+ */
 int TileCandidate::GetMaximalPendingBlocks(TileAxis *excluded_axis) {
   int64_t blocks = 1;
   for (auto axis : this->tile_axis_) {
diff --git a/src/poly/tiling_analyzer.h b/src/poly/tiling_analyzer.h
index 59f4eb8cac39070d873c5e4c58220a8a3f4801b9..31b340e5cb6f93f36d420e2e8722c08a0119784f 100644
--- a/src/poly/tiling_analyzer.h
+++ b/src/poly/tiling_analyzer.h
@@ -380,7 +380,8 @@ class TileCandidate {
   static int GetCoreNumConf();
   int GetMinFactorToEnableMulticore(TileAxis *axis);
   int GetMaximalPendingBlocks(TileAxis *excluded_axis);
-  int GetMinUbToGmDataAfterAxis(TileAxis *axis);
+  int GetDmaCopySizeWithinAxis(TileAxis *axis);
+  int GetMinFactorForMinDataGranularity(TileAxis *axis);
 
  private:
   void DoMemInfer();
diff --git a/src/poly/tiling_strategy_manager.cc b/src/poly/tiling_strategy_manager.cc
index 5e67ec31856ebaba68eac6d65c119b6f61ca509e..7b71748c102e3ee7042076373b4355202ec3bda2 100644
--- a/src/poly/tiling_strategy_manager.cc
+++ b/src/poly/tiling_strategy_manager.cc
@@ -433,14 +433,13 @@ void GemmStrategy::AddConstraint() {
 std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
   int max_core = cand_.GetCoreNumConf();
   int used_core = 1;
-  std::pair<int, int> proposal_range = std::make_pair(
-    std::max(static_cast<int>(MIN_MULTICORE_BYTES / cand_.GetMinUbToGmDataAfterAxis(multicore_axis)), 1), -1);
+  std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1);
   auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1);
   std::stringstream ss;
   if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
   auto shape = multicore_axis->range_extent.as<IntImm>()->value;
   bool is_last_level = false;
-  for (auto other_axis : cand_.GetTileAxis()) {
+  for (auto other_axis : this->cand_.GetTileAxis()) {
     if (other_axis == multicore_axis) break;
     if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue;
     if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
@@ -480,6 +479,7 @@ std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis
   logger_.AppendLog(DO_TILING, ss);
   return proposal_range;
 }
+
 int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *multicore_axis, int64_t tiling_factor) {
   CHECK_GT(tiling_factor, 0) << "tiling factor cant be zero or negative";
   auto proposal_range = GetProposalRangeForFullMulticore(multicore_axis);
@@ -488,12 +488,19 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
   auto origin_factor = tiling_factor;
   std::stringstream ss;
 
-  if ((!multicore_axis->mc_sup) || (multicore_axis->HasAttr("REDUCE_AXIS")) ||
-      (tiling_factor < cand_.GetMinFactorToEnableMulticore(multicore_axis) ||
-      (tiling_factor == max_factor_for_full_cores) || (max_factor_for_full_cores <= 0))) {
+  if ((!multicore_axis->mc_sup) || (multicore_axis->HasAttr("REDUCE_AXIS") || (max_factor_for_full_cores <= 0))) {
     logger_.AppendLine(DO_TILING, "This axis is not suitable for multicore, return.");
     return origin_factor;
   }
+  if (tiling_factor < cand_.GetMinFactorToEnableMulticore(multicore_axis)) {
+    logger_.AppendLine(DO_TILING, "Inner-most tile size is smaller than 32 bytes, multicore is disabled, return.");
+    return origin_factor;
+  }
+  if ((tiling_factor <= min_factor_for_enough_data) ||
+      (min_factor_for_enough_data >= cand_.GetCoreNumConf() * max_factor_for_full_cores)) {
+    logger_.AppendLine(DO_TILING, "Cannot increase degree of parallelism by adjusting current tiling factor, return.");
+    return origin_factor;
+  }
 
   auto CheckConstConstraint = [this, &ss](Expr constraint) {
     if (constraint.as<IntImm>() == nullptr) {
@@ -505,18 +512,27 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
   CheckConstConstraint(multicore_axis->l1_constraints.tile_min_);
   CheckConstConstraint(multicore_axis->l1_constraints.tile_mod_);
 
+  auto pending_blocks = cand_.GetMaximalPendingBlocks(multicore_axis);
   if (tiling_factor < max_factor_for_full_cores) {
     auto end = static_cast<int>(sqrt(max_factor_for_full_cores));
-    while (max_factor_for_full_cores % tiling_factor != 0 && tiling_factor > end) --tiling_factor;
-  } else {
+    while (max_factor_for_full_cores % tiling_factor != 0 && tiling_factor > end) {
+      --tiling_factor;
+    }
+  } else if (max_factor_for_full_cores >= min_factor_for_enough_data) {
     tiling_factor = max_factor_for_full_cores;
+  } else if (max_factor_for_full_cores < min_factor_for_enough_data) {
+    // In this case, simply adjusting the tiling factor to max_factor_for_full_cores may lead to insufficient
+    // data in each core, while adjusting it to min_factor_for_enough_data may lead to fewer parallel cores.
+    // Since pending blocks can compensate for the data in each core, we decide based on their number.
+    tiling_factor = pending_blocks >= static_cast<int>(min_factor_for_enough_data / max_factor_for_full_cores)
+                      ? max_factor_for_full_cores
+                      : min_factor_for_enough_data;
   }
 
   auto shape = multicore_axis->range_extent.as<IntImm>()->value;
   bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0);
   auto multicore_shrink_limit = 2;
   auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor);
-  auto pending_blocks = cand_.GetMaximalPendingBlocks(multicore_axis);
   if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) {
     ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;"
        << " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient.";
diff --git a/src/poly/tiling_strategy_manager.h b/src/poly/tiling_strategy_manager.h
index f83e475ae3026c873f18627f8bdd3d08f1332664..c2f6f87f1a0a11d45ef2051a24081caf5dd95df1 100644
--- a/src/poly/tiling_strategy_manager.h
+++ b/src/poly/tiling_strategy_manager.h
@@ -192,12 +192,12 @@ class MulticoreStrategy {
   MulticoreStrategy(TileCandidate &cand, const std::string log_file)
       : cand_(cand), logger_(TileLogger::GetInstance(log_file)) {}
   ~MulticoreStrategy() {}
-  std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
   int64_t AdjustTilingAccordingToMulticoreConstraint(TileAxis *axis, int64_t tiling_factor);
 
  private:
   TileCandidate &cand_;
   TileLogger &logger_;
+  std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
 };
 } // namespace poly
 } // namespace ir
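
The trade-off introduced by the new else-if branches in AdjustTilingAccordingToMulticoreConstraint can be read in isolation from the surrounding classes. The sketch below is a simplified, standalone restatement of that decision, not the patched code itself: ChooseTilingFactor, full_core_factor, granularity_factor and pending_blocks are illustrative stand-ins for the patched function, max_factor_for_full_cores, min_factor_for_enough_data and the value returned by GetMaximalPendingBlocks(); positive inputs are assumed (the real code guards tiling_factor with CHECK_GT).

#include <cmath>
#include <cstdint>

// Simplified restatement of the adjustment rule: prefer the factor that keeps
// all cores busy (full_core_factor), unless the per-core data would fall below
// the granularity target and the pending (untiled) blocks cannot compensate.
int64_t ChooseTilingFactor(int64_t tiling_factor, int64_t full_core_factor,
                           int64_t granularity_factor, int64_t pending_blocks) {
  if (tiling_factor < full_core_factor) {
    // Shrink toward a divisor of full_core_factor so each core gets equal work.
    int64_t end = static_cast<int64_t>(std::sqrt(static_cast<double>(full_core_factor)));
    while (full_core_factor % tiling_factor != 0 && tiling_factor > end) {
      --tiling_factor;
    }
    return tiling_factor;
  }
  if (full_core_factor >= granularity_factor) {
    return full_core_factor;  // full multi-core use already feeds each core enough data
  }
  // full_core_factor alone starves each core; keep it only when the pending
  // blocks can make up the difference, otherwise fall back to the larger factor.
  return pending_blocks >= granularity_factor / full_core_factor ? full_core_factor
                                                                 : granularity_factor;
}

For instance, with tiling_factor = 24, full_core_factor = 8, granularity_factor = 24 and pending_blocks = 4, the compensation test 4 >= 24 / 8 holds, so the smaller factor 8 is kept and the pending blocks supply the missing per-core data.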