From d920ab32ba72c8e79264a18d4881453ef9fc7185 Mon Sep 17 00:00:00 2001
From: dabaiji
Date: Wed, 8 Jul 2020 17:11:42 +0800
Subject: [PATCH] add core number and core granularity balance model

Add MulticoreStrategy::GetProposalCoreNum(). GetProposalRangeForFullMulticore()
now takes its core number from this helper instead of using GetCoreNumConf()
directly.

The helper multiplies the constant extents of all tile axes. When the product
is below core_num * MIN_CORE_GRANULARITY * MAX_REPEAT, the proposal becomes
product / DESIRE_CORE_GRANULARITY, rounded down to an even number when it is
above 2. If the loop meets an axis tagged TRANSFORM, TRANSPOSE, or REDUCE_AXIS
without REDUCE_SRC_LAST, the value of GetCoreNumConf() is returned unchanged.
If it meets an axis whose extent is not a constant, 0 is returned; the caller
clamps that to one core and returns its default proposal range.

Also rename MIN_MULTICORE_BYTES to MIN_CORE_GRANULARITY, and log the
"adjust may not be efficient" case only when origin_factor / tiling_factor
exceeds multicore_shrink_limit (previously: reaches it).

---
Review notes (ignored by git am, not part of the commit message):
- problem_size is now int64_t and the threshold is computed in 64 bits. With a
  plain int the product of the extents was narrowed on every multiplication and
  could wrap for large shapes, shrinking the proposal for the largest problems.
- DESIRE_CORE_GRANULARITY: 8192 equals 256 * 32, so the trailing comment now
  says "32 repeat". Please confirm whether 8192 or 16384 (256 * 64) is intended.
- NOTE(review): problem_size / DESIRE_CORE_GRANULARITY can be larger than
  GetCoreNumConf() (the threshold allows 65280 per core, the divisor is 8192).
  Left as is; confirm whether the proposal should be clamped.
- The copy under review had lost its line breaks, indentation and template
  arguments (<int>, <IntImm>, <int, int>). They are reconstructed here; check
  them against the tree before applying. The index post-image hashes no longer
  describe the edited lines.

 src/poly/tiling_analyzer.cc         |  2 +-
 src/poly/tiling_analyzer.h          |  4 +++-
 src/poly/tiling_strategy_manager.cc | 36 +++++++++++++++++++++++++----
 src/poly/tiling_strategy_manager.h  |  1 +
 4 files changed, 37 insertions(+), 6 deletions(-)

diff --git a/src/poly/tiling_analyzer.cc b/src/poly/tiling_analyzer.cc
index ae2f2a6..b8338c4 100644
--- a/src/poly/tiling_analyzer.cc
+++ b/src/poly/tiling_analyzer.cc
@@ -717,7 +717,7 @@ int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
     }
     granularity *= l1_val;
   }
-  return std::max(static_cast<int>(MIN_MULTICORE_BYTES / granularity), 1);
+  return std::max(static_cast<int>(MIN_CORE_GRANULARITY / granularity), 1);
 }
 
 /*
diff --git a/src/poly/tiling_analyzer.h b/src/poly/tiling_analyzer.h
index 1e4a253..2d6aa22 100644
--- a/src/poly/tiling_analyzer.h
+++ b/src/poly/tiling_analyzer.h
@@ -52,7 +52,9 @@ constexpr auto DUMP_LEVEL_TUNING = 3;
 constexpr auto DUMP_LINE_BREAK_NUM = 100;
 constexpr auto GEN_PRIME_NUM = 32;
 constexpr auto VECTORIZE_BYTE = 256;
-constexpr auto MIN_MULTICORE_BYTES = 256;
+constexpr auto MAX_REPEAT = 255;
+constexpr auto MIN_CORE_GRANULARITY = 256;
+constexpr auto DESIRE_CORE_GRANULARITY = 8192;  // 256 Bytes * 32 repeat
 
 // Controlled by custom tiling.
 constexpr auto ALLOCATION_PERCENTAGE = 0.5;  // reserved for double buffer in default
diff --git a/src/poly/tiling_strategy_manager.cc b/src/poly/tiling_strategy_manager.cc
index da393e5..37afd6f 100644
--- a/src/poly/tiling_strategy_manager.cc
+++ b/src/poly/tiling_strategy_manager.cc
@@ -426,19 +426,47 @@ void GemmStrategy::AddConstraint() {
   }
 }
 
-std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
+// Adjust max core for element-wise and inner-most reduction operations to balance core number and granularity.
+int MulticoreStrategy::GetProposalCoreNum() {
   int max_core = cand_.GetCoreNumConf();
+  int64_t problem_size = 1;  // product of all axis extents; 64-bit so large shapes do not wrap around
+
+  for (auto axis : this->cand_.GetTileAxis()) {
+    if (axis->range_extent.as<IntImm>() == nullptr) {
+      return 0;  // non-constant extent: no proposal, the caller falls back to its default range
+    }
+
+    if ((axis->HasAttr("TRANSFORM")) || (axis->HasAttr("TRANSPOSE")) ||
+        (axis->HasAttr("REDUCE_AXIS") && !axis->HasAttr("REDUCE_SRC_LAST"))) {
+      return max_core;  // outside the balance model: keep the core number from GetCoreNumConf()
+    }
+
+    problem_size *= axis->range_extent.as<IntImm>()->value;
+  }
+
+  if (problem_size < static_cast<int64_t>(max_core) * MIN_CORE_GRANULARITY * MAX_REPEAT) {
+    max_core = static_cast<int>(problem_size / DESIRE_CORE_GRANULARITY);
+    if (max_core > 2 && max_core % 2 != 0) {
+      max_core--;  // round odd proposals above 2 down to an even core number
+    }
+  }
+  return max_core;
+}
+
+std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
+  int max_core = GetProposalCoreNum();
   int used_core = 1;
   std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1);
   auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1);
   std::stringstream ss;
-  if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
+  if (multicore_axis->range_extent.as<IntImm>() == nullptr || this_level_core <= 1) {
+    return proposal_range;
+  }
   auto shape = multicore_axis->range_extent.as<IntImm>()->value;
   bool is_last_level = false;
   for (auto other_axis : this->cand_.GetTileAxis()) {
     if (other_axis == multicore_axis) break;
     if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue;
-    if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
     int64_t l1_val = TileVarId::UNDEFINE;
     std::tie(l1_val, std::ignore) = cand_.GetConstTileVal(other_axis);
     if (l1_val == TileVarId::VAR) return proposal_range;
@@ -529,7 +557,7 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
   bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0);
   auto multicore_shrink_limit = 2;
   auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor);
-  if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) {
+  if ((static_cast<int>(origin_factor / tiling_factor) > multicore_shrink_limit) && reduced_mem > pending_blocks) {
     ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;"
        << " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient.";
     logger_.AppendLog(DO_TILING, ss);
diff --git a/src/poly/tiling_strategy_manager.h b/src/poly/tiling_strategy_manager.h
index 2330f1c..27dff22 100644
--- a/src/poly/tiling_strategy_manager.h
+++ b/src/poly/tiling_strategy_manager.h
@@ -199,6 +199,7 @@ class MulticoreStrategy {
   TileCandidate &cand_;
   TileLogger &logger_;
   std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
+  int GetProposalCoreNum();
 };
 
 class TilingPriorityScorer {
-- 
GitLab