diff --git a/src/poly/tiling_analyzer.cc b/src/poly/tiling_analyzer.cc
index ae2f2a6cde170e665c3127a45e820cbee995d748..b8338c4ac0cbdb7d4e59b0fc54b9252443800a10 100644
--- a/src/poly/tiling_analyzer.cc
+++ b/src/poly/tiling_analyzer.cc
@@ -717,7 +717,7 @@ int TileCandidate::GetMinFactorForMinDataGranularity(TileAxis *axis) {
     }
     granularity *= l1_val;
   }
-  return std::max(static_cast<int>(MIN_MULTICORE_BYTES / granularity), 1);
+  return std::max(static_cast<int>(MIN_CORE_GRANULARITY / granularity), 1);
 }
 
 /*
diff --git a/src/poly/tiling_analyzer.h b/src/poly/tiling_analyzer.h
index 1e4a2530c7453d1ca9b3d60c9e50dc3073fa0a2d..2d6aa2265783a6244d71a6acf6d46d471daf0368 100644
--- a/src/poly/tiling_analyzer.h
+++ b/src/poly/tiling_analyzer.h
@@ -52,7 +52,9 @@ constexpr auto DUMP_LEVEL_TUNING = 3;
 constexpr auto DUMP_LINE_BREAK_NUM = 100;
 constexpr auto GEN_PRIME_NUM = 32;
 constexpr auto VECTORIZE_BYTE = 256;
-constexpr auto MIN_MULTICORE_BYTES = 256;
+constexpr auto MAX_REPEAT = 255;
+constexpr auto MIN_CORE_GRANULARITY = 256;
+constexpr auto DESIRE_CORE_GRANULARITY = 8192;  // 256 Bytes * 64 repeat
 
 // Controlled by custom tiling.
 constexpr auto ALLOCATION_PERCENTAGE = 0.5;  // reserved for double buffer in default
diff --git a/src/poly/tiling_strategy_manager.cc b/src/poly/tiling_strategy_manager.cc
index da393e523ce21a2e3a4d647423af48432671d697..37afd6f3545509eb4f55fc0edf705c196f0c7b43 100644
--- a/src/poly/tiling_strategy_manager.cc
+++ b/src/poly/tiling_strategy_manager.cc
@@ -426,19 +426,47 @@ void GemmStrategy::AddConstraint() {
   }
 }
 
-std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
+// Propose a core number for element-wise and inner-most reduction operations, balancing core count and granularity.
+// Returns 0 when an axis extent is not an IntImm, the configured core number when a TRANSFORM / TRANSPOSE /
+// non-REDUCE_SRC_LAST reduce axis is met, otherwise a value re-derived from the product of all axis extents.
+int MulticoreStrategy::GetProposalCoreNum() {
+  int max_core = cand_.GetCoreNumConf();
+  int64_t problem_size = 1;  // 64-bit on purpose: the product of extents truncates in `int` for large shapes.
+  for (auto axis : this->cand_.GetTileAxis()) {
+    const auto *extent = axis->range_extent.as<IntImm>();
+    if (extent == nullptr) {
+      return 0;
+    }
+    if ((axis->HasAttr("TRANSFORM")) || (axis->HasAttr("TRANSPOSE")) ||
+        (axis->HasAttr("REDUCE_AXIS") && !axis->HasAttr("REDUCE_SRC_LAST"))) {
+      return max_core;  // Keep the configured core number untouched for these axes.
+    }
+    problem_size *= extent->value;
+  }
+  // Re-derive the core number only when the problem is below max_core * MIN_CORE_GRANULARITY * MAX_REPEAT.
+  if (problem_size < static_cast<int64_t>(max_core) * MIN_CORE_GRANULARITY * MAX_REPEAT) {
+    max_core = static_cast<int>(problem_size / DESIRE_CORE_GRANULARITY);  // NOTE(review): may exceed configured.
+    if (max_core > 2 && max_core % 2 != 0) {
+      max_core--;  // Round an odd count above 2 down to the nearest even count.
+    }
+  }
+  return max_core;
+}
+
+std::pair<int, int> MulticoreStrategy::GetProposalRangeForFullMulticore(TileAxis *multicore_axis) {
+  int max_core = GetProposalCoreNum();
   int used_core = 1;
   std::pair<int, int> proposal_range = std::make_pair(cand_.GetMinFactorForMinDataGranularity(multicore_axis), -1);
   auto this_level_core = std::max(static_cast<int>(max_core / used_core), 1);
   std::stringstream ss;
-  if (multicore_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
+  if (multicore_axis->range_extent.as<IntImm>() == nullptr || this_level_core <= 1) {
+    return proposal_range;
+  }
   auto shape = multicore_axis->range_extent.as<IntImm>()->value;
   bool is_last_level = false;
   for (auto other_axis : this->cand_.GetTileAxis()) {
     if (other_axis == multicore_axis) break;
     if (other_axis->index != multicore_axis->index || other_axis->HasAttr("REDUCE_AXIS")) continue;
-    if (other_axis->range_extent.as<IntImm>() == nullptr) return proposal_range;
     int64_t l1_val = TileVarId::UNDEFINE;
     std::tie(l1_val, std::ignore) = cand_.GetConstTileVal(other_axis);
     if (l1_val == TileVarId::VAR) return proposal_range;
@@ -529,7 +557,7 @@ int64_t MulticoreStrategy::AdjustTilingAccordingToMulticoreConstraint(TileAxis *
   bool efficient = (shape % tiling_factor == 0) >= (shape % origin_factor == 0);
   auto multicore_shrink_limit = 2;
   auto reduced_mem = std::max(origin_factor - tiling_factor, min_factor_for_enough_data - tiling_factor);
-  if ((static_cast<int>(origin_factor / tiling_factor) >= multicore_shrink_limit) && reduced_mem > pending_blocks) {
+  if ((static_cast<int>(origin_factor / tiling_factor) > multicore_shrink_limit) && reduced_mem > pending_blocks) {
     ss << "If axis adjust to " << tiling_factor << ", " << reduced_mem << " memory is reduced;"
        << " while maximal pending blocks is only " << pending_blocks << ", adjust may not be efficient.";
     logger_.AppendLog(DO_TILING, ss);
diff --git a/src/poly/tiling_strategy_manager.h b/src/poly/tiling_strategy_manager.h
index 2330f1cd7ed9c2f3bb99270c1601d0ea432d63c0..27dff22d4f127f351e9f208d69fb58c49ab2713b 100644
--- a/src/poly/tiling_strategy_manager.h
+++ b/src/poly/tiling_strategy_manager.h
@@ -199,6 +199,7 @@ class MulticoreStrategy {
   TileCandidate &cand_;
   TileLogger &logger_;
   std::pair<int, int> GetProposalRangeForFullMulticore(TileAxis *axis);
+  int GetProposalCoreNum();
 };
 
 class TilingPriorityScorer {