diff --git a/src/poly/tiling_analyzer.cc b/src/poly/tiling_analyzer.cc index ae2f2a6cde170e665c3127a45e820cbee995d748..b8338c4ac0cbdb7d4e59b0fc54b9252443800a10 100644 --- a/src/poly/tiling_analyzer.cc +++ b/src/poly/tiling_analyzer.cc @@ -717,7 +717,7 @@ int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) { } granularity *= l1_val; } - return std::max(static_cast(MIN_MULTICORE_BYTES / granularity), 1); + return std::max(static_cast(MIN_CORE_GRANULARITY / granularity), 1); } /* diff --git a/src/poly/tiling_analyzer.h b/src/poly/tiling_analyzer.h index 1e4a2530c7453d1ca9b3d60c9e50dc3073fa0a2d..2d6aa2265783a6244d71a6acf6d46d471daf0368 100644 --- a/src/poly/tiling_analyzer.h +++ b/src/poly/tiling_analyzer.h @@ -52,7 +52,9 @@ constexpr auto DUMP_LEVEL_TUNING = 3; constexpr auto DUMP_LINE_BREAK_NUM = 100; constexpr auto GEN_PRIME_NUM = 32; constexpr auto VECTORIZE_BYTE = 256; -constexpr auto MIN_MULTICORE_BYTES = 256; +constexpr auto MAX_REPEAT = 255; +constexpr auto MIN_CORE_GRANULARITY = 256; +constexpr auto DESIRE_CORE_GRANULARITY = 8192; // 256 Bytes * 64 repeat // Controlled by custom tiling. constexpr auto ALLOCATION_PERCENTAGE = 0.5; // reserved for double buffer in default diff --git a/src/poly/tiling_strategy_manager.cc b/src/poly/tiling_strategy_manager.cc index da393e523ce21a2e3a4d647423af48432671d697..37afd6f3545509eb4f55fc0edf705c196f0c7b43 100644 --- a/src/poly/tiling_strategy_manager.cc +++ b/src/poly/tiling_strategy_manager.cc @@ -426,19 +426,47 @@ void GemmStrategy::AddConstraint() { } } -std::pair MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) { +// Adjust max core for element-wise and inner-most reduction operations to balance core number and granularity. +int MulticoreStrategy::GetProposalCoreNum() { int max_core = cand_.GetCoreNumConf(); + int problem_size = 1; + + for (auto axis : this->cand_.GetTileAxis()) { + if (axis->range_extent.as() == nullptr) { + return 0; + } + + if ((axis->HasAttr("TRANSFORM")) || (axis->HasAttr("TRANSPOSE")) || + (axis->HasAttr("REDUCE_AXIS") && !axis->HasAttr("REDUCE_SRC_LAST"))) { + return max_core; + } + + problem_size *= axis->range_extent.as()->value; + } + + if (problem_size < max_core * MIN_CORE_GRANULARITY * MAX_REPEAT) { + max_core = static_cast(problem_size / DESIRE_CORE_GRANULARITY); + if (max_core > 2 && max_core % 2 != 0) { + max_core--; + } + } + return max_core; +} + +std::pair MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) { + int max_core = GetProposalCoreNum(); int used_core = 1; std::pair proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1); auto this_level_core = std::max(static_cast(max_core / used_core), 1); std::stringstream ss; - if (multicore_axis->range_extent.as() == nullptr) return proposal_range; + if (multicore_axis->range_extent.as() == nullptr || this_level_core <= 1) { + return proposal_range; + } auto shape = multicore_axis->range_extent.as()->value; bool is_last_level = false; for (auto other_axis : this->cand_.GetTileAxis()) { if (other_axis == multicore_axis) break; if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue; - if (other_axis->range_extent.as() == nullptr) return proposal_range; int64_t l1_val = TileVarId::UNDEFINE; std::tie(l1_val, std::ignore) = cand_.GetConstTileVal(other_axis); if (l1_val == TileVarId::VAR) return proposal_range; @@ -529,7 +557,7 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis * bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0); auto multicore_shrink_limit = 2; auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor); - if ((static_cast(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) { + if ((static_cast(origin_factor / tiling_factor) > multicore_shrink_limit) && reduced_mem > pending_blocks) { ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;" << " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient."; logger_.AppendLog(DO_TILING, ss); diff --git a/src/poly/tiling_strategy_manager.h b/src/poly/tiling_strategy_manager.h index 2330f1cd7ed9c2f3bb99270c1601d0ea432d63c0..27dff22d4f127f351e9f208d69fb58c49ab2713b 100644 --- a/src/poly/tiling_strategy_manager.h +++ b/src/poly/tiling_strategy_manager.h @@ -199,6 +199,7 @@ class MulticoreStrategy { TileCandidate &cand_; TileLogger &logger_; std::pair GetProposalRangeForFullMulticore(TileAxis *axis); + int GetProposalCoreNum(); }; class TilingPriorityScorer {