diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc
index ad60447e706613252affbac36d635a8f88193a71..191e6c35368619a5ba4671b59362a2043ebdf892 100644
--- a/mace/core/runtime/cpu/cpu_runtime.cc
+++ b/mace/core/runtime/cpu/cpu_runtime.cc
@@ -31,16 +31,12 @@
 #include "mace/public/mace.h"
 #include "mace/utils/macros.h"
 #include "mace/utils/logging.h"
+#include "mace/utils/thread_pool.h"
 
 namespace mace {
 
 int MaceOpenMPThreadCount = 1;
 
-struct CPUFreq {
-  size_t core_id;
-  float freq;
-};
-
 enum SchedulePolicy {
   SCHED_STATIC,
   SCHED_GUIDED,
@@ -105,28 +101,12 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
     return MaceStatus::MACE_RUNTIME_ERROR;
   }
 
-  std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
-  for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
-    cpu_freq[i].core_id = i;
-    cpu_freq[i].freq = cpu_max_freqs[i];
-  }
-  if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
-      policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
-    std::sort(cpu_freq.begin(),
-              cpu_freq.end(),
-              [=](const CPUFreq &lhs, const CPUFreq &rhs) {
-                return lhs.freq < rhs.freq;
-              });
-  } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
-      policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
-    std::sort(cpu_freq.begin(),
-              cpu_freq.end(),
-              [](const CPUFreq &lhs, const CPUFreq &rhs) {
-                return lhs.freq > rhs.freq;
-              });
-  }
+  std::vector<size_t> cores_to_use;
+  MACE_RETURN_IF_ERROR(
+      mace::utils::GetCPUCoresToUse(
+          cpu_max_freqs, policy, num_threads_hint, &cores_to_use));
 
-  int cpu_count = static_cast<int>(cpu_freq.size());
+  int cpu_count = static_cast<int>(cores_to_use.size());
   if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
     num_threads_hint = cpu_count;
   }
@@ -148,32 +128,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
     return MaceStatus::MACE_SUCCESS;
   }
 
-
-  // decide num of cores to use
-  int cores_to_use = 0;
-  if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
-      || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
-    for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
-      if (cpu_freq[i].freq != cpu_freq[0].freq) {
-        break;
-      }
-      ++cores_to_use;
-    }
-    num_threads_hint = std::min(num_threads_hint, cores_to_use);
-  } else {
-    cores_to_use = num_threads_hint;
-  }
-  MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
-
-  VLOG(2) << "Use " << num_threads_hint << " threads";
-  std::vector<size_t> cpu_ids(cores_to_use);
-  for (int i = 0; i < cores_to_use; ++i) {
-    VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
-            << cpu_freq[i].freq;
-    cpu_ids[i] = cpu_freq[i].core_id;
-  }
   SchedulePolicy sched_policy = SCHED_GUIDED;
-  if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) {
+  float first_freq = cpu_max_freqs[cores_to_use[0]];
+  float last_freq = cpu_max_freqs[cores_to_use[cores_to_use.size() - 1]];
+  if (std::abs(first_freq - last_freq) < 1e-6) {
     sched_policy = SCHED_STATIC;
   }
 
@@ -185,7 +143,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
 #endif  // MACE_ENABLE_QUANTIZE
 
   return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
-                                         cpu_ids,
+                                         cores_to_use,
                                          sched_policy);
 }
 
diff --git a/mace/port/android/env.cc b/mace/port/android/env.cc
index fa338f078afef4ba6dbf5bb9930e554aab2b8292..a247cea0b7ff052d6b306487379e691a82fc2b27 100644
--- a/mace/port/android/env.cc
+++ b/mace/port/android/env.cc
@@ -17,7 +17,6 @@
 #include <errno.h>
 #include <unwind.h>
 #include <dlfcn.h>
-#include <unistd.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
 
@@ -50,46 +49,60 @@ LogWriter *AndroidEnv::GetLogWriter() {
 namespace {
 
 struct BacktraceState {
-  void** current;
-  void** end;
+  void **current;
+  void **end;
 };
 
-_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context* context, void* arg) {
-  BacktraceState* state = static_cast<BacktraceState*>(arg);
+_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context *context, void *arg) {
+  BacktraceState *state = static_cast<BacktraceState *>(arg);
   uintptr_t pc = _Unwind_GetIP(context);
   if (pc) {
     if (state->current == state->end) {
       return _URC_END_OF_STACK;
     } else {
-      *state->current++ = reinterpret_cast<void*>(pc);
+      *state->current++ = reinterpret_cast<void *>(pc);
     }
   }
   return _URC_NO_REASON;
 }
 
-size_t BackTrace(void** buffer, size_t max) {
+size_t BackTrace(void **buffer, size_t max) {
   BacktraceState state = {buffer, buffer + max};
   _Unwind_Backtrace(UnwindCallback, &state);
 
   return state.current - buffer;
 }
 
+// Returns true when /sys/devices/system/cpu/cpu<cpu_id>/isolate exists and
+// its first line parses (strtol, auto-detected base) to a non-zero value.
+bool CpuIsolate(size_t cpu_id) {
+  std::ifstream isolate_stream(MakeString(
+      "/sys/devices/system/cpu/cpu", cpu_id, "/isolate"));
+  if (!isolate_stream.is_open()) {
+    // No isolate node (or unreadable): treat the core as not isolated.
+    return false;
+  }
+
+  std::string first_line;
+  if (!std::getline(isolate_stream, first_line)) {
+    return false;
+  }
+  const int isolate_flag = strtol(first_line.c_str(), nullptr, 0);
+  return isolate_flag != 0;
+}
+
 }  // namespace
 
-MaceStatus AndroidEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
-  // compute mask
-  cpu_set_t mask;
-  CPU_ZERO(&mask);
-  for (auto cpu_id : cpu_ids) {
-    CPU_SET(cpu_id, &mask);
-  }
-  pid_t pid = gettid();
-  int err = sched_setaffinity(pid, sizeof(mask), &mask);
-  if (err) {
-    LOG(WARNING) << "SchedSetAffinity failed: " << strerror(errno);
-    return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
-                      "SchedSetAffinity failed: " +
-                      std::string(strerror(errno)));
+MaceStatus AndroidEnv::GetCPUMaxFreq(std::vector<float> *max_freqs) {
+  MACE_RETURN_IF_ERROR(LinuxBaseEnv::GetCPUMaxFreq(max_freqs));
+
+  size_t cpu_num = (max_freqs != nullptr) ? max_freqs->size() : 0;
+  if (cpu_num > 0) {
+    for (size_t i = 0; i < cpu_num; ++i) {
+      if (CpuIsolate(i)) {
+        (*max_freqs)[i] = 0;
+      }
+    }
   }
 
   return MaceStatus::MACE_SUCCESS;
@@ -103,8 +116,8 @@ std::vector<std::string> AndroidEnv::GetBackTraceUnsafe(int max_steps) {
   for (int i = 0; i < steps; ++i) {
     std::ostringstream os;
 
-    const void* addr = buffer[i];
-    const char* symbol = "";
+    const void *addr = buffer[i];
+    const char *symbol = "";
     Dl_info info;
     if (dladdr(addr, &info) && info.dli_sname) {
       symbol = info.dli_sname;
diff --git a/mace/port/android/env.h b/mace/port/android/env.h
index 071340367bf39a03b65837eaea68f105852fce2f..39d16d95468e1b26c8983f1b6700bccb4834ceea 100644
--- a/mace/port/android/env.h
+++ b/mace/port/android/env.h
@@ -29,8 +29,8 @@ namespace port {
 
 class AndroidEnv : public LinuxBaseEnv {
  public:
-  MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
   LogWriter *GetLogWriter() override;
+  MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
   std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
   std::unique_ptr<MallocLogger> NewMallocLogger(
       std::ostringstream *oss,
diff --git a/mace/port/darwin/env.cc b/mace/port/darwin/env.cc
index 3344adbbc487b6bedbd745157c205ab6680ddfb0..2e4a3694ac32ab97d6fb80f61c28c51c52dc7abf 100644
--- a/mace/port/darwin/env.cc
+++ b/mace/port/darwin/env.cc
@@ -15,6 +15,8 @@
 #include "mace/port/darwin/env.h"
 
 #include <execinfo.h>
+#include <mach/thread_act.h>
+#include <mach/thread_policy.h>
 #include <stdint.h>
 #include <sys/sysctl.h>
 #include <sys/time.h>
@@ -33,27 +35,64 @@ namespace mace {
 namespace port {
 
 namespace {
-const char kCpuFrequencyMax[] = "hw.cpufrequency_max";
+
+constexpr const char kCpuFrequencyMax[] = "hw.cpufrequency_max";
+constexpr const char kCpuActiveNum[] = "hw.activecpu";
+
 }
 
 int64_t DarwinEnv::NowMicros() {
   return mace::port::posix::NowMicros();
 }
 
-// TODO(luxuhui): this func is not accurate, darwin does not support
-// acquiring CPU frequencies, we need to reconsider the CPU scheduling
-// strategy.
-MaceStatus DarwinEnv::GetCPUMaxFreq(std::vector<float> *max_freqs) {
-  MACE_CHECK_NOTNULL(max_freqs);
+// Darwin offers no per-core frequency, so this reports hw.cpufrequency_max
+// (read as an integer, then converted to float) once per active core.
+MaceStatus DarwinEnv::GetCPUMaxFreq(std::vector<float> *cpu_infos) {
+  MACE_CHECK_NOTNULL(cpu_infos);
 
   uint64_t freq = 0;
   size_t size = sizeof(freq);
   int ret = sysctlbyname(kCpuFrequencyMax, &freq, &size, NULL, 0);
   if (ret < 0) {
     LOG(ERROR) << "failed to get property: " << kCpuFrequencyMax;
     return MaceStatus::MACE_RUNTIME_ERROR;
   }
-  max_freqs->push_back(freq);
+
+  uint64_t cpu_num = 0;
+  size = sizeof(cpu_num);
+  ret = sysctlbyname(kCpuActiveNum, &cpu_num, &size, NULL, 0);
+  if (ret < 0) {
+    LOG(ERROR) << "failed to get property: " << kCpuActiveNum;
+    return MaceStatus::MACE_RUNTIME_ERROR;
+  }
+
+  // Append the same placeholder frequency once for every active core.
+  const float fake_freq = static_cast<float>(freq);
+  cpu_infos->insert(cpu_infos->end(), cpu_num, fake_freq);
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
+// Builds a Mach affinity tag from cpu_ids and applies it to the calling
+// thread. The policy call is compiled out unless MACE_OS_MAC is defined.
+MaceStatus DarwinEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  unsigned int tag = 0;
+  for (size_t i = 0; i < cpu_ids.size(); ++i) {
+    tag += (cpu_ids[i] << i);
+  }
+
+#ifdef MACE_OS_MAC
+  thread_affinity_policy_data_t policy_data = {static_cast<integer_t>(tag)};
+  const kern_return_t ret = thread_policy_set(
+      pthread_mach_thread_np(pthread_self()), THREAD_AFFINITY_POLICY,
+      reinterpret_cast<thread_policy_t>(&policy_data),
+      THREAD_AFFINITY_POLICY_COUNT);
+  if (ret != KERN_SUCCESS) {
+    // thread_policy_set reports errors through its return code, not errno.
+    LOG(INFO) << "thread_policy_set failed: " << ret;
+    return MaceStatus::MACE_RUNTIME_ERROR;
+  }
+#endif
+
   return MaceStatus::MACE_SUCCESS;
 }
diff --git a/mace/port/darwin/env.h b/mace/port/darwin/env.h
index 7205bb7fa97fd020a294198d45b47114f6ee4873..d709af6a6696de7b3a86c4cc71a2c67fb72ea484 100644
--- a/mace/port/darwin/env.h
+++ b/mace/port/darwin/env.h
@@ -20,6 +20,7 @@
 
 #include "mace/port/env.h"
 #include "mace/port/logger.h"
+#include "mace/port/port-arch.h"
 #include "mace/port/posix/file_system.h"
 
 namespace mace {
@@ -29,6 +30,7 @@ class DarwinEnv : public Env {
  public:
   int64_t NowMicros() override;
   MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
+  MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
   FileSystem *GetFileSystem() override;
   LogWriter *GetLogWriter() override;
   std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
diff --git a/mace/port/linux/env.cc b/mace/port/linux/env.cc
index 00831c5ed89c11a1163c57a1a83a6bdbdd386f62..6e534516db6940b00f9b57910dbd5523f0cf8be8 100644
--- a/mace/port/linux/env.cc
+++ b/mace/port/linux/env.cc
@@ -25,10 +25,21 @@
 #include "mace/port/posix/backtrace.h"
 #include "mace/port/posix/file_system.h"
 #include "mace/port/posix/time.h"
+#include "mace/utils/macros.h"
 
 namespace mace {
 namespace port {
 
+// Deliberately a no-op: on the embedded Linux devices this was tuned for,
+// pinning threads via SchedSetAffinity hurt performance. Removing this
+// override falls back to the LinuxBaseEnv implementation, which may give
+// better performance on other hardware, as it does on Android devices.
+MaceStatus LinuxEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  // The requested core list is intentionally ignored.
+  MACE_UNUSED(cpu_ids);
+  return MaceStatus::MACE_SUCCESS;
+}
+
 LogWriter *LinuxEnv::GetLogWriter() {
   return &log_writer_;
 }
diff --git a/mace/port/linux/env.h b/mace/port/linux/env.h
index 825dd29d9afe11fe1fd234ad1e1ba888381a403d..9e2dc517a3820d8c993d8272cc6eab72647f24df 100644
--- a/mace/port/linux/env.h
+++ b/mace/port/linux/env.h
@@ -26,6 +26,7 @@ namespace port {
 
 class LinuxEnv : public LinuxBaseEnv {
  public:
+  MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
   LogWriter *GetLogWriter() override;
   std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
 
diff --git a/mace/port/linux_base/env.cc b/mace/port/linux_base/env.cc
index 335e0e31b60f8a70afd3666b5dd04d3118458c7a..10b946ac62de806ddc7f1f1cf113530b6e4d1924 100644
--- a/mace/port/linux_base/env.cc
+++ b/mace/port/linux_base/env.cc
@@ -14,7 +14,10 @@
 
 #include "mace/port/linux_base/env.h"
 
+#include <errno.h>
+#include <sys/syscall.h>
 #include <sys/time.h>
+#include <unistd.h>
 
 #include <cstddef>
 #include <fstream>
@@ -28,7 +31,6 @@
 namespace mace {
 namespace port {
 
-
 namespace {
 
 int GetCPUCount() {
@@ -100,5 +102,24 @@ MaceStatus LinuxBaseEnv::GetCPUMaxFreq(std::vector<float> *max_freqs) {
   return MaceStatus::MACE_SUCCESS;
 }
 
+// Pins the calling thread (SYS_gettid) to cpu_ids via sched_setaffinity.
+MaceStatus LinuxBaseEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
+  cpu_set_t mask;
+  CPU_ZERO(&mask);
+  for (auto cpu_id : cpu_ids) {
+    CPU_SET(cpu_id, &mask);
+  }
+
+  if (sched_setaffinity(syscall(SYS_gettid), sizeof(mask), &mask) != 0) {
+    // Read errno exactly once: the LOG call below may overwrite it.
+    const std::string msg =
+        "SchedSetAffinity failed: " + std::string(strerror(errno));
+    LOG(WARNING) << msg;
+    return MaceStatus(MaceStatus::MACE_INVALID_ARGS, msg);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
+
 }  // namespace port
 }  // namespace mace
diff --git a/mace/port/linux_base/env.h b/mace/port/linux_base/env.h
index 07270f2a7b3eaef3997f5a94e87a218fa5b64ca0..7ef0e9fcd3149cb681b4d8ccafe0ecf9dee7bc2a 100644
--- a/mace/port/linux_base/env.h
+++ b/mace/port/linux_base/env.h
@@ -28,6 +28,7 @@ class LinuxBaseEnv : public Env {
   int64_t NowMicros() override;
   MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
   FileSystem *GetFileSystem() override;
+  MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
 
  protected:
   PosixFileSystem posix_file_system_;
diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc
index 8cbdbf140e65c4aee6c22378b6996d8616c7ee24..5fa3ad6e419b8f597c803d173bf9ec47b94c6fd2 100644
--- a/mace/utils/thread_pool.cc
+++ b/mace/utils/thread_pool.cc
@@ -13,6 +13,8 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <numeric>
+
 #include "mace/port/port.h"
 #include "mace/port/env.h"
 #include "mace/utils/logging.h"
@@ -26,6 +28,8 @@ namespace utils {
 constexpr int kThreadPoolSpinWaitTime = 2000000;  // ns
 constexpr int kTileCountPerThread = 2;
 constexpr int kMaxCostUsingSingleThread = 100;
+constexpr int kMinCpuCoresForPerformance = 3;
+constexpr int kMaxCpuCoresForPerformance = 5;
 
 namespace {
 
@@ -42,67 +46,87 @@ struct CPUFreq {
   float freq;
 };
 
-void GetCPUCoresToUse(const std::vector<float> &cpu_max_freqs,
-                      const CPUAffinityPolicy policy,
-                      const size_t thread_count_hint,
-                      std::vector<size_t> *cores) {
-  size_t thread_count = thread_count_hint;
-  if (!cpu_max_freqs.empty()) {
-    const size_t cpu_count = cpu_max_freqs.size();
-    if (thread_count == 0 || thread_count > cpu_count) {
-      thread_count = cpu_count;
+// Counts the cores to use: each entry counts until the minimum is reached,
+// then only entries above the mean non-zero frequency, up to the maximum.
+size_t GetCpuCoresForPerfomance(const std::vector<CPUFreq> &cpu_freqs) {
+  float total_freq = 0.0f;
+  size_t valid_cpu_nums = 0;
+  for (const CPUFreq &cpu_freq : cpu_freqs) {
+    total_freq += cpu_freq.freq;
+    if (cpu_freq.freq != 0) ++valid_cpu_nums;
+  }
+  // All-zero input: use a mean of 0 instead of dividing by zero.
+  const float avg_freq = valid_cpu_nums ? total_freq / valid_cpu_nums : 0.0f;
+
+  size_t cores_to_use = 0;
+  for (const CPUFreq &cpu_info : cpu_freqs) {
+    if ((cpu_info.freq > avg_freq && cores_to_use < kMaxCpuCoresForPerformance)
+        || cores_to_use < kMinCpuCoresForPerformance) {
+      ++cores_to_use;
     }
+  }
 
-    if (policy != CPUAffinityPolicy::AFFINITY_NONE) {
-      std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
-      for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
-        cpu_freq[i].core_id = i;
-        cpu_freq[i].freq = cpu_max_freqs[i];
-      }
-      if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
-          policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
-        std::sort(cpu_freq.begin(),
-                  cpu_freq.end(),
-                  [=](const CPUFreq &lhs, const CPUFreq &rhs) {
-                    return lhs.freq < rhs.freq;
-                  });
-      } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
-          policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
-        std::sort(cpu_freq.begin(),
-                  cpu_freq.end(),
-                  [](const CPUFreq &lhs, const CPUFreq &rhs) {
-                    return lhs.freq > rhs.freq;
-                  });
-      }
+  return cores_to_use;
+}
 
-      // decide num of cores to use
-      size_t cores_to_use = 0;
-      if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
-          || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
-        for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
-          if (cpu_freq[i].freq != cpu_freq[0].freq) {
-            break;
-          }
-          ++cores_to_use;
-        }
-      } else {
-        cores_to_use = thread_count;
-      }
-      MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
-      cores->resize(cores_to_use);
-      for (size_t i = 0; i < cores_to_use; ++i) {
-        VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id
-                << " with freq "
-                << cpu_freq[i].freq;
-        (*cores)[i] = static_cast<int>(cpu_freq[i].core_id);
-      }
-    }
-  } else {
+}  // namespace
+
+// Selects the ids of the CPU cores to bind threads to, written to *cores.
+// AFFINITY_NONE leaves *cores untouched; an empty cpu_max_freqs is reported
+// as MACE_RUNTIME_ERROR. A thread_count_hint of 0, or one above the core
+// count, means "all cores".
+MaceStatus GetCPUCoresToUse(const std::vector<float> &cpu_max_freqs,
+                            const CPUAffinityPolicy policy,
+                            const size_t thread_count_hint,
+                            std::vector<size_t> *cores) {
+  const size_t cpu_count = cpu_max_freqs.size();
+  if (cpu_count == 0) {
+    LOG(ERROR) << "CPU core is empty";
+    return MaceStatus::MACE_RUNTIME_ERROR;
+  }
+  if (policy == CPUAffinityPolicy::AFFINITY_NONE) {
+    return MaceStatus::MACE_SUCCESS;
+  }
-}
 
-}  // namespace
+  const bool little_only = policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY;
+  const bool big_only = policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY;
+  const bool slowest_first =
+      little_only || policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE;
+  const bool fastest_first =
+      big_only || policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE;
+
+  // Keep each frequency paired with its core id so the id survives sorting.
+  std::vector<CPUFreq> cpu_freq(cpu_count);
+  for (size_t id = 0; id < cpu_count; ++id) {
+    cpu_freq[id].core_id = id;
+    cpu_freq[id].freq = cpu_max_freqs[id];
+  }
+  if (slowest_first || fastest_first) {
+    std::sort(cpu_freq.begin(), cpu_freq.end(),
+              [slowest_first](const CPUFreq &lhs, const CPUFreq &rhs) {
+                return slowest_first ? lhs.freq < rhs.freq
+                                     : lhs.freq > rhs.freq;
+              });
+  }
+
+  // Cluster-only policies derive the count from the sorted frequencies;
+  // the remaining policies use the hint, clamped to the core count.
+  size_t cores_to_use = thread_count_hint;
+  if (big_only || little_only) {
+    cores_to_use = GetCpuCoresForPerfomance(cpu_freq);
+  } else if (cores_to_use == 0 || cores_to_use > cpu_count) {
+    cores_to_use = cpu_count;
+  }
+  MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
+  cores->resize(cores_to_use);
+  for (size_t i = 0; i < cores_to_use; ++i) {
+    VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id
+            << " with freq " << cpu_freq[i].freq;
+    (*cores)[i] = static_cast<int>(cpu_freq[i].core_id);
+  }
+
+  return MaceStatus::MACE_SUCCESS;
+}
 
 ThreadPool::ThreadPool(const size_t thread_count_hint,
                        const CPUAffinityPolicy policy)
@@ -173,13 +197,13 @@ void ThreadPool::Run(const std::function<void(const int64_t)> &func,
   std::unique_lock<std::mutex> run_lock(run_mutex_);
 
   for (size_t i = 0; i < thread_count; ++i) {
-    int64_t count = iters_per_thread + (static_cast<int64_t>(i) < remainder);
+    int64_t range_len =
+        iters_per_thread + (static_cast<int64_t>(i) < remainder);
     thread_infos_[i].range_start = iters_offset;
-    int64_t range_end = iters_offset + count;
-    thread_infos_[i].range_end = range_end;
-    thread_infos_[i].range_len = range_end - iters_offset;
+    thread_infos_[i].range_len = range_len;
+    thread_infos_[i].range_end = iters_offset + range_len;
     thread_infos_[i].func = reinterpret_cast<uintptr_t>(&func);
-    iters_offset += thread_infos_[i].range_len;
+    iters_offset = thread_infos_[i].range_end;
   }
 
   count_down_latch_.Reset(thread_count - 1);
diff --git a/mace/utils/thread_pool.h b/mace/utils/thread_pool.h
index 90d30257bf66da0b7d6d82776b87071779396b9f..5e77c8df1d05012d690ef7088d97439ce5d6e637 100644
--- a/mace/utils/thread_pool.h
+++ b/mace/utils/thread_pool.h
@@ -29,6 +29,11 @@
 namespace mace {
 namespace utils {
 
+MaceStatus GetCPUCoresToUse(const std::vector<float> &cpu_max_freqs,
+                            const CPUAffinityPolicy policy,
+                            const size_t thread_count_hint,
+                            std::vector<size_t> *cores);
+
 class ThreadPool {
  public:
   ThreadPool(const size_t thread_count,