From ce46ef22d6efc2696bf677c61c33536a21b92a8f Mon Sep 17 00:00:00 2001
From: hong19860320 <9973393+hong19860320@users.noreply.github.com>
Date: Thu, 20 Jun 2019 03:37:49 +0000
Subject: [PATCH] ARM cpu_info refine test=develop

---
 paddle/fluid/lite/core/context.h   |    9 +-
 paddle/fluid/lite/core/cpu_info.cc | 1384 +++++++++++++++-------------
 paddle/fluid/lite/core/cpu_info.h  |  104 +--
 3 files changed, 773 insertions(+), 724 deletions(-)
diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h
index 81041dfc9..a79ce04fa 100644
--- a/paddle/fluid/lite/core/context.h
+++ b/paddle/fluid/lite/core/context.h
@@ -67,7 +67,7 @@ class Context<TargetType::kARM> {
   ARMContext& operator=(const ARMContext& ctx) {}
 
   // NOTE: InitOnce should only be used by ContextScheduler
-  void InitOnce() {}
+  void InitOnce() { DeviceInfo::Init(); }
 
   void CopyShared(const ARMContext* ctx) {}
 
@@ -78,20 +78,19 @@ class Context<TargetType::kARM> {
     return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
   }
   void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
-  void BindDev() { return DeviceInfo::Global().BindDev(); }
 
   PowerMode mode() const { return DeviceInfo::Global().mode(); }
   int threads() const { return DeviceInfo::Global().threads(); }
   ARMArch arch() const { return DeviceInfo::Global().arch(); }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
 
   template <typename T>
   T* workspace_data() {
     return DeviceInfo::Global().workspace_data<T>();
   }
 
-  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
-  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
-  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
   bool ExtendWorkspace(DDimLite dims) {
     return DeviceInfo::Global().ExtendWorkspace(dims);
   }
diff --git a/paddle/fluid/lite/core/cpu_info.cc b/paddle/fluid/lite/core/cpu_info.cc
index 52cd47fb5..40353631f 100644
--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
@@ -29,7 +29,8 @@
 #include <omp.h>
 #endif
 
-#include <cstdarg>
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/lite/core/cpu_info.h"
 
 namespace paddle {
@@ -37,549 +38,55 @@ namespace lite {
 
 #ifdef LITE_WITH_ARM
 
-void DeviceInfo::InitInternal(DeviceInfo* dev) {
-  set_default_cache(dev);
-  dev->compute_core_num_ = arm_get_cpucount();
-  dev->max_memory_ = arm_get_meminfo();
-
-// get max freq
-#ifdef LITE_WITH_LINUX
-  std::vector<int> max_freq(dev->compute_core_num_);
-  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    max_freq[i] = get_max_freq_khz(i) / 1000;
-  }
-  std::string cpu_name = arm_get_cpu_name();
-  if (get_cpu_info_from_name(dev, cpu_name) != true) {
-    arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
-                                    max_freq, &dev->cluster_ids_);
-    dev->big_core_ids_.clear();
-    dev->little_core_ids_.clear();
-    for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
-      if (dev->cluster_ids_[i] == 0) {
-        dev->big_core_ids_.push_back(dev->core_ids_[i]);
-      } else {
-        dev->little_core_ids_.push_back(dev->core_ids_[i]);
-      }
-    }
-    arm_get_cpu_arch(&dev->archs_);
-  }
-
-  LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_;
-  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i]
-              << ", frequence: " << max_freq[i]
-              << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
-              << ", CPU ARCH: A" << dev->archs_[i];
-  }
-  VLOG(1) << "L1 DataCache size is: ";
-  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
-  }
-  VLOG(1) << "L2 Cache size is: ";
-  for (int i = 0; i < dev->compute_core_num_; ++i) {
-    VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
-  }
-  VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
-
-  dev->max_freq_ = max_freq[0];
-  for (int j = 1; j < dev->compute_core_num_; ++j) {
-    if (dev->max_freq_ < max_freq[j]) {
-      dev->max_freq_ = max_freq[j];
-    }
-  }
-#elif defined(TARGET_IOS)
-  arm_get_cpu_arch(&dev->archs_);
-#endif
-  dev->active_ids_ = {0};
-  dev->mode_ = LITE_POWER_HIGH;
-  dev->workspace_.Resize({static_cast<int64_t>(
-      dev->L2_cache_[dev->active_ids_[0]] / sizeof(float))});
 #ifdef TARGET_IOS
-  dev->arch_ = APPLE;  // use 6x8
+const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
 #else
-  if (dev->big_core_ids_.size() > 0) {
-    dev->arch_ = dev->archs_[dev->big_core_ids_[0]];
-  }
+const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
 #endif
-}
 
-void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
-  int cpu_count = arm_get_cpucount();
-  L1_cache_.resize(cpu_count);
-  L2_cache_.resize(cpu_count);
-  L3_cache_.resize(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    L1_cache_[i] = l1size;
-    L2_cache_[i] = l2size;
-    L3_cache_[i] = l3size;
-  }
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-
-void DeviceInfo::BindDev() {
-#ifdef ARM_WITH_OMP
-  int num_threads = active_ids_.size();
-  omp_set_num_threads(num_threads);
+int get_cpu_num() {
 #ifdef LITE_WITH_LINUX
-  std::vector<int> ssarets;
-  for (int j = 0; j < num_threads; ++j) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < num_threads; i++) {
-    ssarets[i] = set_sched_affinity(active_ids_);
-  }
-  for (int i = 0; i < num_threads; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
-      return;
+  // get cpu count from /sys/devices/system/cpu/cpunum/uevent
+  int max_cpu_num = 20;
+  int cpu_num = 0;
+  for (int i = 0; i < max_cpu_num; ++i) {
+    char path[256];
+    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
+    FILE* fp = fopen(path, "rb");
+    if (!fp) {
+      break;
     }
+    cpu_num++;
+    fclose(fp);
   }
-#endif  // LITE_WITH_LINUX
-#else   // ARM_WITH_OMP
-#ifdef LITE_WITH_LINUX
-  std::vector<int> cpuid1;
-  cpuid1.push_back(active_ids_[0]);
-  int ssaret = set_sched_affinity(cpuid1);
-  if (ssaret != 0) {
-    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
-    return;
-  }
-#endif  // LITE_WITH_LINUX
-#endif  // ARM_WITH_OMP
-}
-
-void DeviceInfo::SetRunMode(PowerMode mode, int threads) {
-  int big_core_size = big_core_ids_.size();
-  int small_core_size = little_core_ids_.size();
-  if (threads > big_core_size + small_core_size) {
-    threads = big_core_size + small_core_size;
+  if (cpu_num < 1) {
+    cpu_num = 1;
   }
-#ifdef ARM_WITH_OMP
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case LITE_POWER_FULL:
-      mode_ = mode;
-      active_ids_.clear();
-      for (int i = 0; i < threads; ++i) {
-        if (i < big_core_size) {
-          active_ids_.push_back(big_core_ids_[i]);
-        } else {
-          active_ids_.push_back(little_core_ids_[i - big_core_size]);
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_HIGH;
-        if (threads > big_core_size) {
-          LOG(ERROR) << "threads: " << threads
-                     << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(big_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
-        if (threads > small_core_size) {
-          active_ids_ = little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(little_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_NO_BIND:
-      mode_ = LITE_POWER_NO_BIND;
-      active_ids_.clear();
-      if (threads > core_ids_.size()) {
-        active_ids_.resize(core_ids_.size());
-      } else {
-        active_ids_.resize(threads);
-      }
-      break;
-    case LITE_POWER_RAND_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_RAND_HIGH;
-        if (threads > big_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                big_core_ids_[(i + shift_num) % big_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(WARNING)
-            << "HIGH POWER MODE is not support, switch to little cores.";
-        if (threads > small_core_size) {
-          active_ids_ = little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_RAND_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_RAND_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                little_core_ids_[(i + shift_num) % small_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
-        if (threads > big_core_size) {
-          active_ids_ = big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-  }
-  //! fix multi-threads LITE_POWER_HIGH mode
-  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
-    int threads = active_ids_.size();
-    omp_set_num_threads(threads);
-  } else {
-    if (check_online(active_ids_)) {
-      BindDev();
-    } else {
-      LOG(WARNING) << "core id " << active_ids_[0]
-                   << " is offline, switch to NO BIND MODE";
-      int threads = active_ids_.size();
-      omp_set_num_threads(threads);
-    }
+  return cpu_num;
+#elif defined(TARGET_IOS)
+  int cpu_num = 0;
+  size_t len = sizeof(cpu_num);
+  sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
+  if (cpu_num < 1) {
+    cpu_num = 1;
   }
+  return cpu_num;
 #else
-  if (big_core_size > 0) {
-    active_ids_ = {big_core_ids_[0]};
-  } else {
-    active_ids_ = {0};
-  }
+  return 1;
 #endif
-  //! alloc memory for sgemm in this context
-  int temp_mem_size = L2_cache_[active_ids_[0]] / sizeof(float);
-  workspace_.Resize({temp_mem_size});
-  arch_ = archs_[active_ids_[0]];
-}
-
-bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
-  auto count = dims.product();
-  auto old = workspace_.dims();
-  if (count == old.product()) {
-    return false;
-  }
-
-  workspace_.Resize({static_cast<int64_t>(
-      count + L2_cache_[active_ids_[0]] / sizeof(float))});
-  return true;
-}
-
-// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
-void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
-  va_list arg_ptr;
-  va_start(arg_ptr, argc);
-  std::vector<int>* cache;
-  switch (cache_id) {
-    case 0:
-      cache = &cpu_info->L1_cache_;
-      break;
-    case 1:
-      cache = &cpu_info->L2_cache_;
-      break;
-    case 2:
-      cache = &cpu_info->L3_cache_;
-      break;
-    default:
-      break;
-  }
-  int core_num = cpu_info->compute_core_num_;
-  cache->resize(core_num);
-  if (argc == 1) {
-    int cache_size = va_arg(arg_ptr, int);
-    for (int i = 0; i < core_num; ++i) {
-      (*cache)[i] = cache_size;
-    }
-  } else {
-    int big_core_num = cpu_info->big_core_ids_.size();
-    int little_core_num = cpu_info->little_core_ids_.size();
-    int big_core_cache_size = va_arg(arg_ptr, int);
-    int little_core_cache_size = va_arg(arg_ptr, int);
-    for (int i = 0; i < big_core_num; ++i) {
-      (*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size;
-    }
-    for (int i = 0; i < little_core_num; ++i) {
-      (*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size;
-    }
-  }
-  va_end(arg_ptr);
-}
-
-void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
-  va_list arg_ptr;
-  va_start(arg_ptr, argc);
-  int core_num = cpu_info->compute_core_num_;
-  cpu_info->archs_.resize(core_num);
-  if (argc == 1) {
-    ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
-    for (int i = 0; i < core_num; ++i) {
-      cpu_info->archs_[i] = arch;
-    }
-  } else {
-    ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
-    ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
-    int big_core_num = cpu_info->big_core_ids_.size();
-    int little_core_num = cpu_info->little_core_ids_.size();
-    for (int i = 0; i < big_core_num; ++i) {
-      cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch;
-    }
-    for (int i = 0; i < little_core_num; ++i) {
-      cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch;
-    }
-  }
-  va_end(arg_ptr);
 }
 
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
-  /* Snapdragon */
-  if (hardware_name.find("SDM845") != std::string::npos) {  // 845
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 2, kA75, kA55);
-    set_cache_info(cpu_info, 0, 1, 32 * 1024);
-    set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024);
-    set_cache_info(cpu_info, 2, 1, 2048 * 1024);
-    return true;
-
-  } else if (hardware_name.find("SDM710") != std::string::npos) {  // 710
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
-    set_arch_info(cpu_info, 2, kA75, kA55);
-    return true;
-  } else if (hardware_name.find("MSM8998") != std::string::npos) {  // 835
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 2, kA73, kA53);
-    set_cache_info(cpu_info, 0, 2, 64 * 1024);
-    set_cache_info(cpu_info, 1, 2, 1024 * 1024,
-                   /*real cache size is 2M, while that will get bad performace
-                      on conv3x3s1 or gemm, set to 1M or 512K*/
-                   1024 * 1024);
-    return true;
-
-  } else if (hardware_name.find("MSM8996") != std::string::npos) {  // 820
-    cpu_info->compute_core_num_ = 4;
-    cpu_info->core_ids_ = {0, 1, 2, 3};
-    cpu_info->big_core_ids_ = {2, 3};
-    cpu_info->little_core_ids_ = {0, 1};
-    cpu_info->cluster_ids_ = {1, 1, 0, 0};
-    set_arch_info(cpu_info, 1, kA72);
-    set_cache_info(cpu_info, 0, 1, 24 * 1024);
-    set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
-    return true;
-
-  } else if (hardware_name.find("SDM660") != std::string::npos ||
-             hardware_name.find("SDM636") != std::string::npos) {  // 660, 636
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 1, kA73);
-    set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024);
-    set_cache_info(cpu_info, 1, 1, 1024 * 1024);
-    return true;
-
-  } else if (hardware_name.find("MSM8976") != std::string::npos) {  // 652,653
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 2, kA72, kA53);
-    set_cache_info(cpu_info, 0, 1, 32 * 1024);
-    set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
-    return true;
-
-  } else if (hardware_name.find("MSM8953") != std::string::npos) {  // 625
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {};
-    cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 1, kA53);
-    set_cache_info(cpu_info, 0, 1, 32 * 1024);
-    set_cache_info(cpu_info, 1, 1, 1024 * 1024);
-    return true;
-
-  } else if (hardware_name.find("MSM8939") != std::string::npos) {  // 615
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {0, 1, 2, 3};
-    cpu_info->little_core_ids_ = {4, 5, 6, 7};
-    cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
-    set_arch_info(cpu_info, 1, kA53);
-    set_cache_info(cpu_info, 0, 1, 32 * 1024);
-    set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
-    return true;
-
-    /* MediaTek */
-
-  } else if (hardware_name.find("MT6797") !=
-             std::string::npos) {  // X20/X23/X25/X27
-    cpu_info->compute_core_num_ = 10;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-    cpu_info->big_core_ids_ = {8, 9};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
-    set_arch_info(cpu_info, 2, kA72, kA53);
-    set_cache_info(cpu_info, 0, 1, 32 * 1024);
-    set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
-    return true;
-
-  } else if (hardware_name.find("MT6799") != std::string::npos) {  // X30
-    cpu_info->compute_core_num_ = 10;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
-    cpu_info->big_core_ids_ = {8, 9};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
-    set_arch_info(cpu_info, 2, kA73, kA53);
-    return true;
-
-  } else if (hardware_name.find("MT6795") != std::string::npos ||
-             hardware_name.find("MT6762") != std::string::npos ||
-             hardware_name.find("MT6755T") != std::string::npos ||
-             hardware_name.find("MT6755S") != std::string::npos ||
-             hardware_name.find("MT6753") != std::string::npos ||
-             hardware_name.find("MT6752") != std::string::npos ||
-             hardware_name.find("MT6750") != std::string::npos) {
-    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {};
-    cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 1, kA53);
-    return true;
-
-  } else if (hardware_name.find("MT6758") != std::string::npos ||
-             hardware_name.find("MT6757") != std::string::npos ||
-             hardware_name.find("MT6763") != std::string::npos ||
-             hardware_name.find("MT6755M") != std::string::npos ||
-             hardware_name.find("MT6755") !=
-                 std::string::npos) {  // P30, P20/P25, P23, P10
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 1, kA53);
-    return true;
-
-  } else if (hardware_name.find("MT6771") != std::string::npos) {  // P60
-    cpu_info->compute_core_num_ = 8;
-    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
-    cpu_info->big_core_ids_ = {4, 5, 6, 7};
-    cpu_info->little_core_ids_ = {0, 1, 2, 3};
-    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
-    set_arch_info(cpu_info, 2, kA73, kA53);
-    return true;
-
-  } else if (hardware_name.find("MT6765") != std::string::npos ||
-             hardware_name.find("MT6739") != std::string::npos ||
-             hardware_name.find("MT6738") != std::string::npos ||
-             hardware_name.find("MT6737") !=
-                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
-    cpu_info->compute_core_num_ = 4;
-    cpu_info->core_ids_ = {0, 1, 2, 3};
-    cpu_info->big_core_ids_ = {0, 0, 0, 0};
-    cpu_info->little_core_ids_ = {};
-    cpu_info->cluster_ids_ = {0, 0, 0, 0};
-    set_arch_info(cpu_info, 1, kA53);
-    return true;
-  }
-  return false;
-}
-
-size_t arm_get_meminfo() {
+size_t get_mem_size() {
 #ifdef LITE_WITH_LINUX
   // get cpu count from /proc/cpuinfo
   FILE* fp = fopen("/proc/meminfo", "rb");
   if (!fp) {
     return 1;
   }
-
   size_t memsize = 0;
   char line[1024];
   while (!feof(fp)) {
@@ -589,52 +96,18 @@ size_t arm_get_meminfo() {
     }
     sscanf(s, "MemTotal:        %d kB", &memsize);
   }
-
   fclose(fp);
-
   return memsize;
 #elif defined(TARGET_IOS)
   // to be implemented
   printf("not implemented\n");
-  return 0;
-#endif
-}
-
-int arm_get_cpucount() {
-#ifdef LITE_WITH_LINUX
-  // get cpu count from /sys/devices/system/cpu/cpunum/uevent
-  int max_cpu_count = 20;
-  int count = 0;
-  for (int i = 0; i < max_cpu_count; ++i) {
-    char path[256];
-    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
-    FILE* fp = fopen(path, "rb");
-    if (!fp) {
-      break;
-    }
-    count++;
-    fclose(fp);
-  }
-  if (count < 1) {
-    count = 1;
-  }
-  return count;
-#elif defined(TARGET_IOS)
-  int count = 0;
-  size_t len = sizeof(count);
-  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
-  if (count < 1) {
-    count = 1;
-  }
-  return count;
-#else
-  return 1;
 #endif
+  return 0;
 }
 
-void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
-#ifdef LITE_WITH_LINUX
+void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
   archs->clear();
+#ifdef LITE_WITH_LINUX
   //! get CPU ARCH
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
@@ -668,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
         case 0xd0a:
           archs->push_back(kA75);
           break;
+        case 0xd40:
+          archs->push_back(kA76);
+          break;
+        case 0x804:
+          // 855
+          archs->push_back(kA76);
+          break;
+        case 0x805:
+          // 855
+          archs->push_back(kA55);
+          break;
+        case 0x802:
+          // 845
+          archs->push_back(kA75);
+          break;
+        case 0x803:
+          // 845
+          archs->push_back(kA55);
+          break;
+        case 0x801:
+          // 835
+          archs->push_back(kA73);
+          break;
         case 0x800:
           // 835
           archs->push_back(kA73);
@@ -677,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
           archs->push_back(kA72);
           break;
         default:
-          LOG(ERROR) << "unknow type";
+          LOG(ERROR) << "Unknow cpu arch: " << arch_id;
           archs->push_back(kARMArch_UNKOWN);
       }
     }
   }
   fclose(fp);
-  int cpu_count = arm_get_cpucount();
-  if (archs->size() < cpu_count) {
-    for (int i = archs->size(); i < cpu_count; ++i) {
+  if (archs->size() < cpu_num) {
+    for (int i = archs->size(); i < cpu_num; ++i) {
       archs->push_back(archs->at(i - 1));
     }
   }
-#endif
-#ifdef TARGET_IOS
-  int cpu_count = arm_get_cpucount();
-  for (int i = 0; i < cpu_count; ++i) {
+#elif defined(TARGET_IOS)
+  for (int i = 0; i < cpu_num; ++i) {
     archs->push_back(APPLE);
   }
+#else
+  for (int i = 0; i < cpu_num; ++i) {
+    archs->push_back(kARMArch_UNKOWN);
+  }
 #endif
 }
 
 #ifdef LITE_WITH_LINUX
 
-void set_default_cache(DeviceInfo* dev) {
-  int cpu_count = arm_get_cpucount();
-  dev->L1_cache_.resize(cpu_count);
-  dev->L2_cache_.resize(cpu_count);
-  dev->L3_cache_.resize(cpu_count);
-#ifdef TARGET_IOS
-  for (int i = 0; i < cpu_count; ++i) {
-    dev->L1_cache_[i] = 64 * 1024;
-    dev->L2_cache_[i] = 2048 * 1024;
-    dev->L3_cache_[i] = 0;
-  }
-#else
-  for (int i = 0; i < cpu_count; ++i) {
-    dev->L1_cache_[i] = 32 * 1024;
-    dev->L2_cache_[i] = 512 * 1024;
-    dev->L3_cache_[i] = 0;
-  }
-#endif
-}
-std::string arm_get_cpu_name() {
+std::string get_cpu_name() {
   FILE* fp = fopen("/proc/cpuinfo", "rb");
   if (!fp) {
     return "";
@@ -739,122 +217,163 @@ std::string arm_get_cpu_name() {
   return "";
 }
 
-int get_max_freq_khz(int cpuid) {
+void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
+  *max_freq = 0;
+  *min_freq = 0;
   // first try, for all possible cpu
   char path[256];
   snprintf(path, sizeof(path),
-           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
-
+           "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
   FILE* fp = fopen(path, "rb");
-
   if (!fp) {
     // second try, for online cpu
     snprintf(path, sizeof(path),
              "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
-             cpuid);
+             cpu_id);
     fp = fopen(path, "rb");
-
     if (!fp) {
       // third try, for online cpu
+      // get max_freq
       snprintf(path, sizeof(path),
-               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
+               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
+               cpu_id);
       fp = fopen(path, "rb");
-
       if (!fp) {
-        return -1;
+        return;
       }
-
-      int max_freq_khz = -1;
-      fscanf(fp, "%d", &max_freq_khz);
-
+      fscanf(fp, "%d", max_freq);
       fclose(fp);
-
-      return max_freq_khz;
+      // get min_freq
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
+               cpu_id);
+      fp = fopen(path, "rb");
+      if (!fp) {
+        return;
+      }
+      fscanf(fp, "%d", min_freq);
+      fclose(fp);
+      return;
     }
   }
-
-  int max_freq_khz = 0;
+  *min_freq = std::numeric_limits<int>::max();
   while (!feof(fp)) {
-    int freq_khz = 0;
-    int nscan = fscanf(fp, "%d %*d", &freq_khz);
+    int freq = 0;
+    int nscan = fscanf(fp, "%d %*d", &freq);
     if (nscan != 1) {
       break;
     }
-
-    if (freq_khz > max_freq_khz) {
-      max_freq_khz = freq_khz;
+    if (freq > *max_freq) {
+      *max_freq = freq;
+    }
+    if (freq < *min_freq) {
+      *min_freq = freq;
     }
   }
-
   fclose(fp);
-
-  return max_freq_khz;
 }
 
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids) {
-  if (cpu_count == 0) {
-    return 0;
+void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
+                            std::vector<int>* cpu_ids,
+                            std::vector<int>* cluster_ids) {
+  int cpu_num = max_freqs.size();
+  if (cpu_num == 0) {
+    return;
   }
-
-  cpuids->resize(cpu_count);
-  cluster_ids->resize(cpu_count);
-
-  for (int i = 0; i < cpu_count; i++) {
-    cpuids->at(i) = i;
+  cpu_ids->resize(cpu_num);
+  cluster_ids->resize(cpu_num);
+  for (int i = 0; i < cpu_num; i++) {
+    cpu_ids->at(i) = i;
   }
-
   // sort cpuid as big core first
   // simple bubble sort
-
-  for (int i = 0; i < cpu_count; i++) {
-    for (int j = i + 1; j < cpu_count; j++) {
-      if (cpu_freq[i] < cpu_freq[j]) {
+  for (int i = 0; i < cpu_num; i++) {
+    for (int j = i + 1; j < cpu_num; j++) {
+      if (max_freqs[i] < max_freqs[j]) {
         // swap
-        int tmp = cpuids->at(i);
-        cpuids->at(i) = cpuids->at(j);
-        cpuids->at(j) = tmp;
+        int tmp = cpu_ids->at(i);
+        cpu_ids->at(i) = cpu_ids->at(j);
+        cpu_ids->at(j) = tmp;
       }
     }
   }
   // SMP
-  int mid_max_freq_khz =
-      (cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
+  int mid_max_freq =
+      (max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
 
-  for (int i = 0; i < cpu_count; i++) {
-    cpuids->at(i) = i;
-    if (cpu_freq[i] >= mid_max_freq_khz) {
+  for (int i = 0; i < cpu_num; i++) {
+    cpu_ids->at(i) = i;
+    if (max_freqs[i] >= mid_max_freq) {
       cluster_ids->at(i) = 0;
     } else {
       cluster_ids->at(i) = 1;
     }
   }
-  return 0;
 }
 
-int check_online(const std::vector<int>& core_ids) {
-  if (core_ids.size() == 0) {
-    return 0;
+void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
+                        int* l3_cache_size) {
+  int max_cache_idx_num = 10;
+  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
+  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
+  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
+  for (int i = 0; i < max_cache_idx_num; i++) {
+    char path[256];
+    snprintf(path, sizeof(path),
+             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i);
+    FILE* fp = fopen(path, "rb");
+    if (fp) {
+      int level = -1;
+      fscanf(fp, "%d", &level);
+      fclose(fp);
+      snprintf(path, sizeof(path),
+               "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i);
+      fp = fopen(path, "rb");
+      if (fp) {
+        int size = -1;
+        fscanf(fp, "%d", &size);
+        fclose(fp);
+        if (size >= 0) {
+          if (level == 1) {
+            *l1_cache_size = size * 1024;
+          } else if (level == 2) {
+            *l2_cache_size = size * 1024;
+          } else if (level == 3) {
+            *l3_cache_size = size * 1024;
+          }
+        }
+      }
+    }
+  }
+}
+
+bool check_cpu_online(const std::vector<int>& cpu_ids) {
+  if (cpu_ids.size() == 0) {
+    return false;
   }
   char path[256];
-  int online = 1;
-  for (int i = 0; i < core_ids.size(); ++i) {
+  bool all_online = true;
+  for (int i = 0; i < cpu_ids.size(); ++i) {
     snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
-             core_ids[i]);
+             cpu_ids[i]);
     FILE* fp = fopen(path, "rb");
-    if (!fp) {
-      return 0;
+    int is_online = 0;
+    if (fp) {
+      fscanf(fp, "%d", &is_online);
+      fclose(fp);
+    } else {
+      LOG(ERROR) << "Failed to query the online statue of CPU id:"
+                 << cpu_ids[i];
+    }
+    if (is_online == 0) {
+      all_online = false;
+      LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
     }
-    int cur_online = 0;
-    fscanf(fp, "%d", &cur_online);
-    online &= cur_online;
-    fclose(fp);
   }
-  return online;
+  return all_online;
 }
 
-int set_sched_affinity(const std::vector<int>& cpuids) {
+int set_sched_affinity(const std::vector<int>& cpu_ids) {
 // #define CPU_SETSIZE 1024
 // #define __NCPUBITS  (8 * sizeof (unsigned long))
 // typedef struct
@@ -870,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
 #endif
   cpu_set_t mask;
   CPU_ZERO(&mask);
-  for (int i = 0; i < cpuids.size(); i++) {
-    CPU_SET(cpuids[i], &mask);
+  for (int i = 0; i < cpu_ids.size(); ++i) {
+    CPU_SET(cpu_ids[i], &mask);
   }
-
   int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
   if (syscallret) {
-    LOG(ERROR) << "syscall error " << syscallret;
     return -1;
   }
+  return 0;
+}
+
+bool bind_threads(const std::vector<int> cpu_ids) {
+#ifdef ARM_WITH_OMP
+  int thread_num = cpu_ids.size();
+  omp_set_num_threads(thread_num);
+  std::vector<int> ssarets;
+  for (int i = 0; i < thread_num; ++i) {
+    ssarets.push_back(0);
+  }
+#pragma omp parallel for
+  for (int i = 0; i < thread_num; i++) {
+    ssarets[i] = set_sched_affinity(cpu_ids);
+  }
+  for (int i = 0; i < thread_num; i++) {
+    if (ssarets[i] != 0) {
+      LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
+      return false;
+    }
+  }
+#else   // ARM_WITH_OMP
+  std::vector<int> first_cpu_id;
+  first_cpu_id.push_back(cpu_ids[0]);
+  int ssaret = set_sched_affinity(first_cpu_id);
+  if (ssaret != 0) {
+    LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
+    return false;
+  }
+#endif  // ARM_WITH_OMP
+}
+
+#endif  // LITE_WITH_LINUX
+
+// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
+void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
+  va_list arg_ptr;
+  va_start(arg_ptr, argc);
+  std::vector<int>* cache;
+  switch (cache_id) {
+    case 0:
+      cache = &L1_cache_;
+      break;
+    case 1:
+      cache = &L2_cache_;
+      break;
+    case 2:
+      cache = &L3_cache_;
+      break;
+    default:
+      break;
+  }
+  cache->resize(core_num_);
+  if (argc == 1) {
+    int cache_size = va_arg(arg_ptr, int);
+    for (int i = 0; i < core_num_; ++i) {
+      (*cache)[i] = cache_size;
+    }
+  } else {
+    int big_core_num = big_core_ids_.size();
+    int little_core_num = little_core_ids_.size();
+    int big_core_cache_size = va_arg(arg_ptr, int);
+    int little_core_cache_size = va_arg(arg_ptr, int);
+    for (int i = 0; i < big_core_num; ++i) {
+      (*cache)[big_core_ids_[i]] = big_core_cache_size;
+    }
+    for (int i = 0; i < little_core_num; ++i) {
+      (*cache)[little_core_ids_[i]] = little_core_cache_size;
+    }
+  }
+  va_end(arg_ptr);
+}
+
+void DeviceInfo::SetArchInfo(int argc, ...) {
+  va_list arg_ptr;
+  va_start(arg_ptr, argc);
+  archs_.resize(core_num_);
+  if (argc == 1) {
+    ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
+    for (int i = 0; i < core_num_; ++i) {
+      archs_[i] = arch;
+    }
+  } else {
+    ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
+    ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
+    int big_core_num = big_core_ids_.size();
+    int little_core_num = little_core_ids_.size();
+    for (int i = 0; i < big_core_num; ++i) {
+      archs_[big_core_ids_[i]] = big_core_arch;
+    }
+    for (int i = 0; i < little_core_num; ++i) {
+      archs_[little_core_ids_[i]] = little_core_arch;
+    }
+  }
+  va_end(arg_ptr);
+}
+
+bool DeviceInfo::SetCPUInfoByName() {
+  /* Snapdragon */
+  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA76, kA55);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
+    SetCacheInfo(2, 1, 2048 * 1024);
+    return true;
+  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA75, kA55);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
+    SetCacheInfo(2, 1, 2048 * 1024);
+    return true;
+  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {6, 7};
+    little_core_ids_ = {0, 1, 2, 3, 4, 5};
+    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
+    SetArchInfo(2, kA75, kA55);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
+    SetCacheInfo(2, 1, 1024 * 1024);
+    return true;
+  } else if (dev_name_.find("MSM8998") != std::string::npos) {  // 835
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA73, kA53);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 2, 1024 * 1024,
+                 /*real cache size is 2M, while that will get bad performace
+                    on conv3x3s1 or gemm, set to 1M or 512K*/
+                 1024 * 1024);
+    return true;
+  } else if (dev_name_.find("MSM8996") != std::string::npos) {  // 820
+    core_num_ = 4;
+    core_ids_ = {0, 1, 2, 3};
+    big_core_ids_ = {2, 3};
+    little_core_ids_ = {0, 1};
+    cluster_ids_ = {1, 1, 0, 0};
+    SetArchInfo(1, kA72);
+    SetCacheInfo(0, 1, 24 * 1024);
+    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
+    return true;
+  } else if (dev_name_.find("SDM660") != std::string::npos ||
+             dev_name_.find("SDM636") != std::string::npos) {  // 660, 636
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(1, kA73);
+    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
+    SetCacheInfo(1, 1, 1024 * 1024);
+    return true;
+  } else if (dev_name_.find("MSM8976") != std::string::npos) {  // 652,653
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA72, kA53);
+    SetCacheInfo(0, 1, 32 * 1024);
+    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
+    return true;
+  } else if (dev_name_.find("MSM8953") != std::string::npos) {  // 625
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    little_core_ids_ = {};
+    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
+    SetArchInfo(1, kA53);
+    SetCacheInfo(0, 1, 32 * 1024);
+    SetCacheInfo(1, 1, 1024 * 1024);
+    return true;
+  } else if (dev_name_.find("MSM8939") != std::string::npos) {  // 615
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {0, 1, 2, 3};
+    little_core_ids_ = {4, 5, 6, 7};
+    cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
+    SetArchInfo(1, kA53);
+    SetCacheInfo(0, 1, 32 * 1024);
+    SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
+    return true;
+    /* MediaTek */
+  } else if (dev_name_.find("MT6797") !=
+             std::string::npos) {  // X20/X23/X25/X27
+    core_num_ = 10;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    big_core_ids_ = {8, 9};
+    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
+    SetArchInfo(2, kA72, kA53);
+    SetCacheInfo(0, 1, 32 * 1024);
+    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
+    return true;
+  } else if (dev_name_.find("MT6799") != std::string::npos) {  // X30
+    core_num_ = 10;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
+    big_core_ids_ = {8, 9};
+    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
+    SetArchInfo(2, kA73, kA53);
+    return true;
+  } else if (dev_name_.find("MT6795") != std::string::npos ||
+             dev_name_.find("MT6762") != std::string::npos ||
+             dev_name_.find("MT6755T") != std::string::npos ||
+             dev_name_.find("MT6755S") != std::string::npos ||
+             dev_name_.find("MT6753") != std::string::npos ||
+             dev_name_.find("MT6752") != std::string::npos ||
+             dev_name_.find("MT6750") != std::string::npos) {
+    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    little_core_ids_ = {};
+    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
+    SetArchInfo(1, kA53);
+    return true;
+  } else if (dev_name_.find("MT6758") != std::string::npos ||
+             dev_name_.find("MT6757") != std::string::npos ||
+             dev_name_.find("MT6763") != std::string::npos ||
+             dev_name_.find("MT6755M") != std::string::npos ||
+             dev_name_.find("MT6755") !=
+                 std::string::npos) {  // P30, P20/P25, P23, P10
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(1, kA53);
+    return true;
+  } else if (dev_name_.find("MT6771") != std::string::npos) {  // P60
+    core_num_ = 8;
+    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
+    big_core_ids_ = {4, 5, 6, 7};
+    little_core_ids_ = {0, 1, 2, 3};
+    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
+    SetArchInfo(2, kA73, kA53);
+    return true;
+  } else if (dev_name_.find("MT6765") != std::string::npos ||
+             dev_name_.find("MT6739") != std::string::npos ||
+             dev_name_.find("MT6738") != std::string::npos ||
+             dev_name_.find("MT6737") !=
+                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
+    core_num_ = 4;
+    core_ids_ = {0, 1, 2, 3};
+    big_core_ids_ = {0, 1, 2, 3};
+    little_core_ids_ = {};
+    cluster_ids_ = {0, 0, 0, 0};
+    SetArchInfo(1, kA53);
+    return true;
+  }
+  return false;
+}
+
+void DeviceInfo::SetCPUInfoByProb() {
+#ifdef LITE_WITH_LINUX
+  // get big.LITTLE cores by sorting CPU frequency
+  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
+  big_core_ids_.clear();
+  little_core_ids_.clear();
+  for (int i = 0; i < cluster_ids_.size(); ++i) {
+    if (cluster_ids_[i] == 0) {
+      big_core_ids_.push_back(core_ids_[i]);
+    } else {
+      little_core_ids_.push_back(core_ids_[i]);
+    }
+  }
+  // get l1, l2, l3 cache size for each core
+  for (int i = 0; i < core_num_; i++) {
+    get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
+  }
+#endif  // LITE_WITH_LINUX
+}
+
+void DeviceInfo::RequestPowerFullMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  for (int i = 0; i < thread_num; ++i) {
+    if (i < big_core_size) {
+      active_ids_.push_back(big_core_ids_[i]);
+    } else if (i < big_core_size + little_core_size) {
+      active_ids_.push_back(little_core_ids_[i - big_core_size]);
+    }
+  }
+  mode_ = LITE_POWER_FULL;
+}
+
+void DeviceInfo::RequestPowerHighMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (big_core_size > 0) {
+    mode_ = LITE_POWER_HIGH;
+    if (thread_num > big_core_size) {
+      LOG(ERROR) << "Request thread num: " << thread_num
+                 << ", exceed the big cores size: " << big_core_size
+                 << ", truncate thread num to " << big_core_size;
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_LOW;
+    LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
+    if (thread_num > little_core_size) {
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  }
+}
+
+void DeviceInfo::RequestPowerLowMode(const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (little_core_size > 0) {
+    mode_ = LITE_POWER_LOW;
+    if (thread_num > little_core_size) {
+      LOG(WARNING) << "Request thread num: " << thread_num
+                   << ", exceed the little cores size: " << little_core_size
+                   << ", truncate thread num to " << little_core_size;
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; i++) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_HIGH;
+    LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
+    if (thread_num > big_core_size) {
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; i++) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  }
+}
 
+void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
+  active_ids_.clear();
+  for (int i = 0; i < thread_num; i++) {
+    active_ids_.push_back(0);
+  }
+  mode_ = LITE_POWER_NO_BIND;
+}
+
+void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
+                                          const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  if (big_core_size > 0) {
+    mode_ = LITE_POWER_RAND_HIGH;
+    if (thread_num > big_core_size) {
+      LOG(WARNING) << "Request thread num: " << thread_num
+                   << ", exceed the big cores size: " << big_core_size
+                   << ", truncate thread num to " << big_core_size;
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_LOW;
+    LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
+    if (thread_num > little_core_size) {
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(little_core_ids_[i]);
+      }
+    }
+  }
+}
+
+void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
+                                         const int thread_num) {
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  active_ids_.clear();
+  if (little_core_size > 0) {
+    mode_ = LITE_POWER_RAND_LOW;
+    if (thread_num > little_core_size) {
+      LOG(WARNING) << "Request thread num: " << thread_num
+                   << ", exceed the little cores size: " << little_core_size
+                   << ", truncate thread num to " << little_core_size;
+      active_ids_ = little_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(
+            little_core_ids_[(i + shift_num) % little_core_size]);
+      }
+    }
+  } else {
+    mode_ = LITE_POWER_HIGH;
+    LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
+    if (thread_num > big_core_size) {
+      active_ids_ = big_core_ids_;
+    } else {
+      for (int i = 0; i < thread_num; ++i) {
+        active_ids_.push_back(big_core_ids_[i]);
+      }
+    }
+  }
+}
+
+int DeviceInfo::Setup() {
+  core_num_ = get_cpu_num();
+  mem_size_ = get_mem_size();
+  get_cpu_arch(&archs_, core_num_);
+  // set defalut CPU info
+  SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE);
+  SetCacheInfo(1, DEFAULT_L2_CACHE_SIZE);
+  SetCacheInfo(2, DEFAULT_L3_CACHE_SIZE);
+#ifdef LITE_WITH_LINUX
+  // get max&min freq
+  max_freqs_.resize(core_num_);
+  min_freqs_.resize(core_num_);
+  for (int i = 0; i < core_num_; ++i) {
+    int max_freq, min_freq;
+    get_cpu_max_min_freq(i, &max_freq, &min_freq);
+    max_freqs_[i] = max_freq / 1000;
+    min_freqs_[i] = min_freq / 1000;
+  }
+  // get cache size and big.LITTLE core ids
+  dev_name_ = get_cpu_name();
+  if (!SetCPUInfoByName()) {
+    SetCPUInfoByProb();
+  }
+  // output info
+  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
+  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
+              << ", max freq: " << max_freqs_[i]
+              << ", min freq: " << min_freqs_[i]
+              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
+              << ", CPU ARCH: A" << archs_[i];
+  }
+  LOG(INFO) << "L1 DataCache size is: ";
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
+  }
+  LOG(INFO) << "L2 Cache size is: ";
+  for (int i = 0; i < core_num_; ++i) {
+    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
+  }
+  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
+#endif
+  // set default run mode
+  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
   return 0;
 }
 
+void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
+#ifdef ARM_WITH_OMP
+  thread_num = std::min(thread_num, core_num_);
+#else
+  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
+#endif
+#ifdef LITE_WITH_LINUX
+  int big_core_size = big_core_ids_.size();
+  int little_core_size = little_core_ids_.size();
+  int big_little_core_size = big_core_size + little_core_size;
+  thread_num = std::min(thread_num, big_little_core_size);
+  count_++;
+  int shift_num = (count_ / 10) % big_core_size;
+  switch (mode) {
+    case LITE_POWER_FULL:
+      RequestPowerFullMode(thread_num);
+      break;
+    case LITE_POWER_HIGH:
+      RequestPowerHighMode(thread_num);
+      break;
+    case LITE_POWER_LOW:
+      RequestPowerLowMode(thread_num);
+      break;
+    case LITE_POWER_NO_BIND:
+      RequestPowerNoBindMode(thread_num);
+      break;
+    case LITE_POWER_RAND_HIGH:
+      RequestPowerRandHighMode(shift_num, thread_num);
+      break;
+    case LITE_POWER_RAND_LOW:
+      RequestPowerRandLowMode(shift_num, thread_num);
+      break;
+    default:
+      LOG(FATAL) << "Unsupported power mode: " << mode;
+      break;
+  }
+  if (active_ids_.size() == 0) {
+    active_ids_.push_back(0);
+  }
+#ifdef ARM_WITH_OMP
+  omp_set_num_threads(active_ids_.size());
+#endif
+  if (mode_ != LITE_POWER_NO_BIND) {
+    if (check_cpu_online(active_ids_)) {
+      bind_threads(active_ids_);
+    } else {
+      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
+      mode_ = LITE_POWER_NO_BIND;
+    }
+  }
+#else  // LITE_WITH_LINUX
+  // only LITE_POWER_NO_BIND is supported in other OS
+  RequestPowerNoBindMode(thread_num);
+#ifdef ARM_WITH_OMP
+  omp_set_num_threads(active_ids_.size());
+#endif
 #endif  // LITE_WITH_LINUX
+  //! alloc memory for sgemm in this context
+  workspace_.Resize(
+      {static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
+  arch_ = archs_[active_ids_[0]];
+}
+
+void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
+  SetCacheInfo(0, l1size);
+  SetCacheInfo(1, l2size);
+  SetCacheInfo(2, l3size);
+  workspace_.Resize({2 * (l1size + l2size)});
+}
+
+bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
+  auto count = dims.product();
+  auto old = workspace_.dims();
+  if (count == old.product()) {
+    return false;
+  }
+  workspace_.Resize({static_cast<int64_t>(
+      count + L2_cache_[active_ids_[0]] / sizeof(float))});
+  return true;
+}
 
 #endif  // LITE_WITH_ARM
 
diff --git a/paddle/fluid/lite/core/cpu_info.h b/paddle/fluid/lite/core/cpu_info.h
index b8c6ae95d..3c89d5eb4 100644
--- a/paddle/fluid/lite/core/cpu_info.h
+++ b/paddle/fluid/lite/core/cpu_info.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <cstdarg>
 #include <string>
 #include <vector>
 #include "paddle/fluid/lite/core/lite_tensor.h"
@@ -47,92 +48,73 @@ typedef enum {
 
 class DeviceInfo {
  public:
-  int idx_;
-  int max_freq_;
-  int min_freq_;
-  int generate_arch_;
-  int compute_core_num_;
-  int max_memory_;
-  int sharemem_size_;
-
-  std::string device_name_;
-  std::string compute_ability_;
-
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-
-  ARMArch arch_;
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
-
   static DeviceInfo& Global() {
     static auto* x = new DeviceInfo;
     return *x;
   }
 
-  static void Init() {
-    auto& info = Global();
-    InitInternal(&info);
+  static int Init() {
+    static int ret = Global().Setup();
+    return ret;
   }
 
-  void SetRunMode(PowerMode mode, int threads);
+  int Setup();
+
+  void SetRunMode(PowerMode mode, int thread_num);
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }
-  void BindDev();
 
   PowerMode mode() const { return mode_; }
   int threads() const { return active_ids_.size(); }
   ARMArch arch() const { return arch_; }
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
 
   template <typename T>
   T* workspace_data() {
     return workspace_.mutable_data<T>();
   }
-
-  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
-  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
-  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
   bool ExtendWorkspace(DDimLite dims);
 
  private:
-  DeviceInfo() = default;
-  static void InitInternal(DeviceInfo* dev);
-};
-
-size_t arm_get_meminfo();
-
-int arm_get_cpucount();
-
-void arm_get_cpu_arch(std::vector<ARMArch>* archs);
-
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
+  int core_num_;
+  std::vector<int> max_freqs_;
+  std::vector<int> min_freqs_;
+  int mem_size_;
+  std::string dev_name_;
 
-#ifdef LITE_WITH_LINUX
-
-void set_default_cache(DeviceInfo* dev);
-
-std::string arm_get_cpu_name();
+  std::vector<int> L1_cache_;
+  std::vector<int> L2_cache_;
+  std::vector<int> L3_cache_;
+  std::vector<int> core_ids_;
+  std::vector<int> big_core_ids_;
+  std::vector<int> little_core_ids_;
+  std::vector<int> cluster_ids_;
+  std::vector<ARMArch> archs_;
 
-int get_max_freq_khz(int cpuid);
+  ARMArch arch_;
+  // LITE_POWER_HIGH stands for using big cores,
+  // LITE_POWER_LOW stands for using small core,
+  // LITE_POWER_FULL stands for using all cores
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
 
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids);
-int check_online(const std::vector<int>& core_ids);
-int set_sched_affinity(const std::vector<int>& cpuids);
+  void SetCacheInfo(int cache_id, int argc, ...);
+  void SetArchInfo(int argc, ...);
+  bool SetCPUInfoByName();
+  void SetCPUInfoByProb();
+  void RequestPowerFullMode(const int thread_num);
+  void RequestPowerHighMode(const int thread_num);
+  void RequestPowerLowMode(const int thread_num);
+  void RequestPowerNoBindMode(const int thread_num);
+  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
+  void RequestPowerRandLowMode(const int shift_num, const int thread_num);
 
-#endif  // LITE_WITH_LINUX
+  DeviceInfo() = default;
+};
 
 #endif  // LITE_WITH_ARM
 
-- 
GitLab