Commit ce46ef22 authored by H hong19860320

ARM cpu_info refine

test=develop
Parent 251255bc
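For reference while reading the diff: after this refactoring all ARM CPU detection lives behind the DeviceInfo singleton, and ARMContext::InitOnce() triggers it through DeviceInfo::Init(). A minimal usage sketch follows, assuming the paddle::lite namespace and the headers touched by this commit; the wrapper function ConfigureArmRuntime() is hypothetical and only illustrates the call order, it is not part of the commit.

#include "paddle/fluid/lite/core/cpu_info.h"

void ConfigureArmRuntime() {
  using namespace paddle::lite;  // illustrative; names as they appear in this diff
  // One-time detection of core count, memory size, per-core arch,
  // max/min frequency and cache sizes (Setup() runs once inside Init()).
  DeviceInfo::Init();
  auto& dev = DeviceInfo::Global();
  // Bind two worker threads to the big cluster; the implementation falls
  // back to the little cluster (and logs) when no big cores are available.
  dev.SetRunMode(LITE_POWER_HIGH, 2);
  // Cache sizes of the first active core, typically used for blocking and
  // workspace sizing.
  int l1 = dev.l1_cache_size();
  int l2 = dev.l2_cache_size();
  (void)l1;
  (void)l2;
}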
...
@@ -67,7 +67,7 @@ class Context<TargetType::kARM> {
   ARMContext& operator=(const ARMContext& ctx) {}

   // NOTE: InitOnce should only be used by ContextScheduler
-  void InitOnce() {}
+  void InitOnce() { DeviceInfo::Init(); }

   void CopyShared(const ARMContext* ctx) {}
...
@@ -78,20 +78,19 @@ class Context<TargetType::kARM> {
     return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
   }
   void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
-  void BindDev() { return DeviceInfo::Global().BindDev(); }

   PowerMode mode() const { return DeviceInfo::Global().mode(); }
   int threads() const { return DeviceInfo::Global().threads(); }
   ARMArch arch() const { return DeviceInfo::Global().arch(); }
-  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
-  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
-  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

   template <typename T>
   T* workspace_data() {
     return DeviceInfo::Global().workspace_data<T>();
   }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

   bool ExtendWorkspace(DDimLite dims) {
     return DeviceInfo::Global().ExtendWorkspace(dims);
   }
......
...
@@ -29,7 +29,8 @@
 #ifdef ARM_WITH_OMP
 #include <omp.h>
 #endif
-#include <cstdarg>
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/lite/core/cpu_info.h"

 namespace paddle {
...
@@ -37,853 +38,920 @@ namespace lite {
 #ifdef LITE_WITH_ARM
-void DeviceInfo::InitInternal(DeviceInfo* dev) {
-  set_default_cache(dev);
-  dev->compute_core_num_ = arm_get_cpucount();
-  dev->max_memory_ = arm_get_meminfo();
+#ifdef TARGET_IOS
+const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
+#else
+const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
+#endif
// get max freq int get_cpu_num() {
#ifdef LITE_WITH_LINUX #ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_); // get cpu count from /sys/devices/system/cpu/cpunum/uevent
for (int i = 0; i < dev->compute_core_num_; ++i) { int max_cpu_num = 20;
max_freq[i] = get_max_freq_khz(i) / 1000; int cpu_num = 0;
} for (int i = 0; i < max_cpu_num; ++i) {
std::string cpu_name = arm_get_cpu_name(); char path[256];
if (get_cpu_info_from_name(dev, cpu_name) != true) { snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_, FILE* fp = fopen(path, "rb");
max_freq, &dev->cluster_ids_); if (!fp) {
dev->big_core_ids_.clear(); break;
dev->little_core_ids_.clear(); }
for (int i = 0; i < dev->cluster_ids_.size(); ++i) { cpu_num++;
if (dev->cluster_ids_[i] == 0) { fclose(fp);
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
} }
if (cpu_num < 1) {
cpu_num = 1;
} }
arm_get_cpu_arch(&dev->archs_); return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
} }
return cpu_num;
#else
return 1;
#endif
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_; size_t get_mem_size() {
for (int i = 0; i < dev->compute_core_num_; ++i) { #ifdef LITE_WITH_LINUX
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i] // get cpu count from /proc/cpuinfo
<< ", frequence: " << max_freq[i] FILE* fp = fopen("/proc/meminfo", "rb");
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] if (!fp) {
<< ", CPU ARCH: A" << dev->archs_[i]; return 1;
} }
VLOG(1) << "L1 DataCache size is: "; size_t memsize = 0;
for (int i = 0; i < dev->compute_core_num_; ++i) { char line[1024];
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB"; while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
} }
VLOG(1) << "L2 Cache size is: "; sscanf(s, "MemTotal: %d kB", &memsize);
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
} }
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB"; fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
#endif
return 0;
}
dev->max_freq_ = max_freq[0]; void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
for (int j = 1; j < dev->compute_core_num_; ++j) { archs->clear();
if (dev->max_freq_ < max_freq[j]) { #ifdef LITE_WITH_LINUX
dev->max_freq_ = max_freq[j]; //! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return;
}
char line[1024];
while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
}
if (strstr(line, "part") != NULL) {
int arch_id = 0;
sscanf(s, "CPU part\t: %x", &arch_id);
switch (arch_id) {
case 0xd03:
archs->push_back(kA53);
break;
case 0xd05:
archs->push_back(kA55);
break;
case 0xd07:
archs->push_back(kA57);
break;
case 0xd08:
archs->push_back(kA72);
break;
case 0xd09:
archs->push_back(kA73);
break;
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
break;
case 0x205:
// 820
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
} }
} }
#elif defined(TARGET_IOS) #elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_); for (int i = 0; i < cpu_num; ++i) {
#endif archs->push_back(APPLE);
dev->active_ids_ = {0}; }
dev->mode_ = LITE_POWER_HIGH;
dev->workspace_.Resize({static_cast<int64_t>(
dev->L2_cache_[dev->active_ids_[0]] / sizeof(float))});
#ifdef TARGET_IOS
dev->arch_ = APPLE; // use 6x8
#else #else
if (dev->big_core_ids_.size() > 0) { for (int i = 0; i < cpu_num; ++i) {
dev->arch_ = dev->archs_[dev->big_core_ids_[0]]; archs->push_back(kARMArch_UNKOWN);
} }
#endif #endif
} }
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
int cpu_count = arm_get_cpucount();
L1_cache_.resize(cpu_count);
L2_cache_.resize(cpu_count);
L3_cache_.resize(cpu_count);
for (int i = 0; i < cpu_count; ++i) {
L1_cache_[i] = l1size;
L2_cache_[i] = l2size;
L3_cache_[i] = l3size;
}
workspace_.Resize({2 * (l1size + l2size)});
}
void DeviceInfo::BindDev() {
#ifdef ARM_WITH_OMP
int num_threads = active_ids_.size();
omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX #ifdef LITE_WITH_LINUX
std::vector<int> ssarets;
for (int j = 0; j < num_threads; ++j) { std::string get_cpu_name() {
ssarets.push_back(0); FILE* fp = fopen("/proc/cpuinfo", "rb");
} if (!fp) {
#pragma omp parallel for return "";
for (int i = 0; i < num_threads; i++) {
ssarets[i] = set_sched_affinity(active_ids_);
} }
for (int i = 0; i < num_threads; i++) { char line[1024];
if (ssarets[i] != 0) { while (!feof(fp)) {
LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i]; char* s = fgets(line, 1024, fp);
return; if (!s) {
break;
} }
if (strstr(line, "Hardware") != NULL) {
fclose(fp);
return std::string(line);
} }
#endif // LITE_WITH_LINUX
#else // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
std::vector<int> cpuid1;
cpuid1.push_back(active_ids_[0]);
int ssaret = set_sched_affinity(cpuid1);
if (ssaret != 0) {
printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
return;
} }
#endif // LITE_WITH_LINUX fclose(fp);
#endif // ARM_WITH_OMP return "";
} }
void DeviceInfo::SetRunMode(PowerMode mode, int threads) { void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
int big_core_size = big_core_ids_.size(); *max_freq = 0;
int small_core_size = little_core_ids_.size(); *min_freq = 0;
if (threads > big_core_size + small_core_size) { // first try, for all possible cpu
threads = big_core_size + small_core_size; char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
} }
#ifdef ARM_WITH_OMP fscanf(fp, "%d", max_freq);
count_++; fclose(fp);
int shift_num = (count_ / 10) % big_core_size; // get min_freq
switch (mode) { snprintf(path, sizeof(path),
case LITE_POWER_FULL: "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
mode_ = mode; cpu_id);
active_ids_.clear(); fp = fopen(path, "rb");
for (int i = 0; i < threads; ++i) { if (!fp) {
if (i < big_core_size) { return;
active_ids_.push_back(big_core_ids_[i]);
} else {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
} }
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
} }
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
} }
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break; break;
case LITE_POWER_HIGH:
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (threads > big_core_size) {
LOG(ERROR) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
} }
if (freq > *max_freq) {
*max_freq = freq;
} }
} else { if (freq < *min_freq) {
mode_ = LITE_POWER_LOW; *min_freq = freq;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (threads > small_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
} }
fclose(fp);
}
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
} }
if (active_ids_.size() == 0) { cpu_ids->resize(cpu_num);
active_ids_.push_back(0); cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
} }
break; // sort cpuid as big core first
case LITE_POWER_LOW: // simple bubble sort
active_ids_.clear(); for (int i = 0; i < cpu_num; i++) {
if (small_core_size > 0) { for (int j = i + 1; j < cpu_num; j++) {
mode_ = LITE_POWER_LOW; if (max_freqs[i] < max_freqs[j]) {
if (threads > small_core_size) { // swap
LOG(WARNING) << "threads: " << threads int tmp = cpu_ids->at(i);
<< ", exceed the little cores size: " << small_core_size; cpu_ids->at(i) = cpu_ids->at(j);
active_ids_ = little_core_ids_; cpu_ids->at(j) = tmp;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
} }
}
// SMP
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else { } else {
mode_ = LITE_POWER_HIGH; cluster_ids->at(i) = 1;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
} }
} }
}
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
int* l3_cache_size) {
int max_cache_idx_num = 10;
*l1_cache_size = DEFAULT_L1_CACHE_SIZE;
*l2_cache_size = DEFAULT_L2_CACHE_SIZE;
*l3_cache_size = DEFAULT_L3_CACHE_SIZE;
for (int i = 0; i < max_cache_idx_num; i++) {
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i);
FILE* fp = fopen(path, "rb");
if (fp) {
int level = -1;
fscanf(fp, "%d", &level);
fclose(fp);
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i);
fp = fopen(path, "rb");
if (fp) {
int size = -1;
fscanf(fp, "%d", &size);
fclose(fp);
if (size >= 0) {
if (level == 1) {
*l1_cache_size = size * 1024;
} else if (level == 2) {
*l2_cache_size = size * 1024;
} else if (level == 3) {
*l3_cache_size = size * 1024;
} }
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
} }
break;
case LITE_POWER_NO_BIND:
mode_ = LITE_POWER_NO_BIND;
active_ids_.clear();
if (threads > core_ids_.size()) {
active_ids_.resize(core_ids_.size());
} else {
active_ids_.resize(threads);
} }
break;
case LITE_POWER_RAND_HIGH:
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
if (threads > big_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
big_core_ids_[(i + shift_num) % big_core_size]);
} }
} }
} else { }
mode_ = LITE_POWER_LOW;
LOG(WARNING) bool check_cpu_online(const std::vector<int>& cpu_ids) {
<< "HIGH POWER MODE is not support, switch to little cores."; if (cpu_ids.size() == 0) {
if (threads > small_core_size) { return false;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
char path[256];
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
} }
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
} }
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_RAND_LOW:
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % small_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (threads > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
} }
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
// {
// unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
// } cpu_set_t;
// set affinity for thread
#ifdef __GLIBC__
pid_t pid = syscall(SYS_gettid);
#else
pid_t pid = gettid();
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
} }
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
return -1;
} }
if (active_ids_.size() == 0) { return 0;
active_ids_.push_back(0); }
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
int thread_num = cpu_ids.size();
omp_set_num_threads(thread_num);
std::vector<int> ssarets;
for (int i = 0; i < thread_num; ++i) {
ssarets.push_back(0);
} }
break; #pragma omp parallel for
for (int i = 0; i < thread_num; i++) {
ssarets[i] = set_sched_affinity(cpu_ids);
} }
//! fix multi-threads LITE_POWER_HIGH mode for (int i = 0; i < thread_num; i++) {
if (mode_ == LITE_POWER_NO_BIND || threads > 1) { if (ssarets[i] != 0) {
int threads = active_ids_.size(); LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
omp_set_num_threads(threads); return false;
} else {
if (check_online(active_ids_)) {
BindDev();
} else {
LOG(WARNING) << "core id " << active_ids_[0]
<< " is offline, switch to NO BIND MODE";
int threads = active_ids_.size();
omp_set_num_threads(threads);
} }
} }
#else #else // ARM_WITH_OMP
if (big_core_size > 0) { std::vector<int> first_cpu_id;
active_ids_ = {big_core_ids_[0]}; first_cpu_id.push_back(cpu_ids[0]);
} else { int ssaret = set_sched_affinity(first_cpu_id);
active_ids_ = {0}; if (ssaret != 0) {
} LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
#endif
//! alloc memory for sgemm in this context
int temp_mem_size = L2_cache_[active_ids_[0]] / sizeof(float);
workspace_.Resize({temp_mem_size});
arch_ = archs_[active_ids_[0]];
}
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
auto count = dims.product();
auto old = workspace_.dims();
if (count == old.product()) {
return false; return false;
} }
#endif // ARM_WITH_OMP
workspace_.Resize({static_cast<int64_t>(
count + L2_cache_[active_ids_[0]] / sizeof(float))});
return true;
} }
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 // cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) { void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
va_list arg_ptr; va_list arg_ptr;
va_start(arg_ptr, argc); va_start(arg_ptr, argc);
std::vector<int>* cache; std::vector<int>* cache;
switch (cache_id) { switch (cache_id) {
case 0: case 0:
cache = &cpu_info->L1_cache_; cache = &L1_cache_;
break; break;
case 1: case 1:
cache = &cpu_info->L2_cache_; cache = &L2_cache_;
break; break;
case 2: case 2:
cache = &cpu_info->L3_cache_; cache = &L3_cache_;
break; break;
default: default:
break; break;
} }
int core_num = cpu_info->compute_core_num_; cache->resize(core_num_);
cache->resize(core_num);
if (argc == 1) { if (argc == 1) {
int cache_size = va_arg(arg_ptr, int); int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) { for (int i = 0; i < core_num_; ++i) {
(*cache)[i] = cache_size; (*cache)[i] = cache_size;
} }
} else { } else {
int big_core_num = cpu_info->big_core_ids_.size(); int big_core_num = big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size(); int little_core_num = little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int); int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int); int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) { for (int i = 0; i < big_core_num; ++i) {
(*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size; (*cache)[big_core_ids_[i]] = big_core_cache_size;
} }
for (int i = 0; i < little_core_num; ++i) { for (int i = 0; i < little_core_num; ++i) {
(*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size; (*cache)[little_core_ids_[i]] = little_core_cache_size;
} }
} }
va_end(arg_ptr); va_end(arg_ptr);
} }
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) { void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr; va_list arg_ptr;
va_start(arg_ptr, argc); va_start(arg_ptr, argc);
int core_num = cpu_info->compute_core_num_; archs_.resize(core_num_);
cpu_info->archs_.resize(core_num);
if (argc == 1) { if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int); ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) { for (int i = 0; i < core_num_; ++i) {
cpu_info->archs_[i] = arch; archs_[i] = arch;
} }
} else { } else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = cpu_info->big_core_ids_.size(); int big_core_num = big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size(); int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) { for (int i = 0; i < big_core_num; ++i) {
cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch; archs_[big_core_ids_[i]] = big_core_arch;
} }
for (int i = 0; i < little_core_num; ++i) { for (int i = 0; i < little_core_num; ++i) {
cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch; archs_[little_core_ids_[i]] = little_core_arch;
} }
} }
va_end(arg_ptr); va_end(arg_ptr);
} }
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) { bool DeviceInfo::SetCPUInfoByName() {
/* Snapdragon */ /* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845 if (dev_name_.find("SM8150") != std::string::npos) { // 855
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55); SetArchInfo(2, kA76, kA55);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
set_cache_info(cpu_info, 2, 1, 2048 * 1024); SetCacheInfo(2, 1, 2048 * 1024);
return true; return true;
} else if (dev_name_.find("SDM845") != std::string::npos) { // 845
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; SetArchInfo(2, kA75, kA55);
set_arch_info(cpu_info, 2, kA75, kA55); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
SetCacheInfo(2, 1, 2048 * 1024);
return true;
} else if (dev_name_.find("SDM710") != std::string::npos) { // 710
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
big_core_ids_ = {6, 7};
little_core_ids_ = {0, 1, 2, 3, 4, 5};
cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
SetArchInfo(2, kA75, kA55);
SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
SetCacheInfo(2, 1, 1024 * 1024);
return true; return true;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53); SetArchInfo(2, kA73, kA53);
set_cache_info(cpu_info, 0, 2, 64 * 1024); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, SetCacheInfo(1, 2, 1024 * 1024,
/*real cache size is 2M, while that will get bad performace /*real cache size is 2M, while that will get bad performace
on conv3x3s1 or gemm, set to 1M or 512K*/ on conv3x3s1 or gemm, set to 1M or 512K*/
1024 * 1024); 1024 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8996") != std::string::npos) { // 820
} else if (hardware_name.find("MSM8996") != std::string::npos) { // 820 core_num_ = 4;
cpu_info->compute_core_num_ = 4; core_ids_ = {0, 1, 2, 3};
cpu_info->core_ids_ = {0, 1, 2, 3}; big_core_ids_ = {2, 3};
cpu_info->big_core_ids_ = {2, 3}; little_core_ids_ = {0, 1};
cpu_info->little_core_ids_ = {0, 1}; cluster_ids_ = {1, 1, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 0, 0}; SetArchInfo(1, kA72);
set_arch_info(cpu_info, 1, kA72); SetCacheInfo(0, 1, 24 * 1024);
set_cache_info(cpu_info, 0, 1, 24 * 1024); SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true; return true;
} else if (dev_name_.find("SDM660") != std::string::npos ||
} else if (hardware_name.find("SDM660") != std::string::npos || dev_name_.find("SDM636") != std::string::npos) { // 660, 636
hardware_name.find("SDM636") != std::string::npos) { // 660, 636 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; SetArchInfo(1, kA73);
set_arch_info(cpu_info, 1, kA73); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); SetCacheInfo(1, 1, 1024 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; SetArchInfo(2, kA72, kA53);
set_arch_info(cpu_info, 2, kA72, kA53); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8953") != std::string::npos) { // 625
} else if (hardware_name.find("MSM8953") != std::string::npos) { // 625 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; little_core_ids_ = {};
cpu_info->little_core_ids_ = {}; cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; SetArchInfo(1, kA53);
set_arch_info(cpu_info, 1, kA53); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(1, 1, 1024 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8939") != std::string::npos) { // 615
} else if (hardware_name.find("MSM8939") != std::string::npos) { // 615 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {4, 5, 6, 7}; cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; SetArchInfo(1, kA53);
set_arch_info(cpu_info, 1, kA53); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
return true; return true;
/* MediaTek */ /* MediaTek */
} else if (dev_name_.find("MT6797") !=
} else if (hardware_name.find("MT6797") !=
std::string::npos) { // X20/X23/X25/X27 std::string::npos) { // X20/X23/X25/X27
cpu_info->compute_core_num_ = 10; core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
cpu_info->big_core_ids_ = {8, 9}; big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53); SetArchInfo(2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
return true; return true;
} else if (dev_name_.find("MT6799") != std::string::npos) { // X30
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30 core_num_ = 10;
cpu_info->compute_core_num_ = 10; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; big_core_ids_ = {8, 9};
cpu_info->big_core_ids_ = {8, 9}; little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; SetArchInfo(2, kA73, kA53);
set_arch_info(cpu_info, 2, kA73, kA53);
return true; return true;
} else if (dev_name_.find("MT6795") != std::string::npos ||
} else if (hardware_name.find("MT6795") != std::string::npos || dev_name_.find("MT6762") != std::string::npos ||
hardware_name.find("MT6762") != std::string::npos || dev_name_.find("MT6755T") != std::string::npos ||
hardware_name.find("MT6755T") != std::string::npos || dev_name_.find("MT6755S") != std::string::npos ||
hardware_name.find("MT6755S") != std::string::npos || dev_name_.find("MT6753") != std::string::npos ||
hardware_name.find("MT6753") != std::string::npos || dev_name_.find("MT6752") != std::string::npos ||
hardware_name.find("MT6752") != std::string::npos || dev_name_.find("MT6750") != std::string::npos) {
hardware_name.find("MT6750") != std::string::npos) {
// X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {}; little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53); SetArchInfo(1, kA53);
return true; return true;
} else if (dev_name_.find("MT6758") != std::string::npos ||
} else if (hardware_name.find("MT6758") != std::string::npos || dev_name_.find("MT6757") != std::string::npos ||
hardware_name.find("MT6757") != std::string::npos || dev_name_.find("MT6763") != std::string::npos ||
hardware_name.find("MT6763") != std::string::npos || dev_name_.find("MT6755M") != std::string::npos ||
hardware_name.find("MT6755M") != std::string::npos || dev_name_.find("MT6755") !=
hardware_name.find("MT6755") !=
std::string::npos) { // P30, P20/P25, P23, P10 std::string::npos) { // P30, P20/P25, P23, P10
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53); SetArchInfo(1, kA53);
return true; return true;
} else if (dev_name_.find("MT6771") != std::string::npos) { // P60
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; SetArchInfo(2, kA73, kA53);
set_arch_info(cpu_info, 2, kA73, kA53);
return true; return true;
} else if (dev_name_.find("MT6765") != std::string::npos ||
} else if (hardware_name.find("MT6765") != std::string::npos || dev_name_.find("MT6739") != std::string::npos ||
hardware_name.find("MT6739") != std::string::npos || dev_name_.find("MT6738") != std::string::npos ||
hardware_name.find("MT6738") != std::string::npos || dev_name_.find("MT6737") !=
hardware_name.find("MT6737") !=
std::string::npos) { // A22, MT6739, MT6738, MT6767 std::string::npos) { // A22, MT6739, MT6738, MT6767
cpu_info->compute_core_num_ = 4; core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3}; core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 0, 0, 0}; big_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {}; little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0}; cluster_ids_ = {0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53); SetArchInfo(1, kA53);
return true; return true;
} }
return false; return false;
} }
size_t arm_get_meminfo() { void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX #ifdef LITE_WITH_LINUX
// get cpu count from /proc/cpuinfo // get big.LITTLE cores by sorting CPU frequency
FILE* fp = fopen("/proc/meminfo", "rb"); sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
if (!fp) { big_core_ids_.clear();
return 1; little_core_ids_.clear();
for (int i = 0; i < cluster_ids_.size(); ++i) {
if (cluster_ids_[i] == 0) {
big_core_ids_.push_back(core_ids_[i]);
} else {
little_core_ids_.push_back(core_ids_[i]);
} }
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
} }
sscanf(s, "MemTotal: %d kB", &memsize); // get l1, l2, l3 cache size for each core
for (int i = 0; i < core_num_; i++) {
get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
} }
#endif // LITE_WITH_LINUX
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
} }
int arm_get_cpucount() { void DeviceInfo::RequestPowerFullMode(const int thread_num) {
#ifdef LITE_WITH_LINUX int big_core_size = big_core_ids_.size();
// get cpu count from /sys/devices/system/cpu/cpunum/uevent int little_core_size = little_core_ids_.size();
int max_cpu_count = 20; active_ids_.clear();
int count = 0; for (int i = 0; i < thread_num; ++i) {
for (int i = 0; i < max_cpu_count; ++i) { if (i < big_core_size) {
char path[256]; active_ids_.push_back(big_core_ids_[i]);
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); } else if (i < big_core_size + little_core_size) {
FILE* fp = fopen(path, "rb"); active_ids_.push_back(little_core_ids_[i - big_core_size]);
if (!fp) {
break;
}
count++;
fclose(fp);
}
if (count < 1) {
count = 1;
} }
return count;
#elif defined(TARGET_IOS)
int count = 0;
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
if (count < 1) {
count = 1;
} }
return count; mode_ = LITE_POWER_FULL;
#else
return 1;
#endif
} }
void arm_get_cpu_arch(std::vector<ARMArch>* archs) { void DeviceInfo::RequestPowerHighMode(const int thread_num) {
#ifdef LITE_WITH_LINUX int big_core_size = big_core_ids_.size();
archs->clear(); int little_core_size = little_core_ids_.size();
//! get CPU ARCH active_ids_.clear();
FILE* fp = fopen("/proc/cpuinfo", "rb"); if (big_core_size > 0) {
if (!fp) { mode_ = LITE_POWER_HIGH;
return; if (thread_num > big_core_size) {
} LOG(ERROR) << "Request thread num: " << thread_num
char line[1024]; << ", exceed the big cores size: " << big_core_size
while (!feof(fp)) { << ", truncate thread num to " << big_core_size;
char* s = fgets(line, 1024, fp); active_ids_ = big_core_ids_;
if (!s) { } else {
break; for (int i = 0; i < thread_num; ++i) {
} active_ids_.push_back(big_core_ids_[i]);
if (strstr(line, "part") != NULL) {
int arch_id = 0;
sscanf(s, "CPU part\t: %x", &arch_id);
switch (arch_id) {
case 0xd03:
archs->push_back(kA53);
break;
case 0xd05:
archs->push_back(kA55);
break;
case 0xd07:
archs->push_back(kA57);
break;
case 0xd08:
archs->push_back(kA72);
break;
case 0xd09:
archs->push_back(kA73);
break;
case 0xd0a:
archs->push_back(kA75);
break;
case 0x800:
// 835
archs->push_back(kA73);
break;
case 0x205:
// 820
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
archs->push_back(kARMArch_UNKOWN);
}
} }
} }
fclose(fp); } else {
int cpu_count = arm_get_cpucount(); mode_ = LITE_POWER_LOW;
if (archs->size() < cpu_count) { LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
for (int i = archs->size(); i < cpu_count; ++i) { if (thread_num > little_core_size) {
archs->push_back(archs->at(i - 1)); active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
} }
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
archs->push_back(APPLE);
} }
#endif
} }
#ifdef LITE_WITH_LINUX void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
void set_default_cache(DeviceInfo* dev) { int little_core_size = little_core_ids_.size();
int cpu_count = arm_get_cpucount(); active_ids_.clear();
dev->L1_cache_.resize(cpu_count); if (little_core_size > 0) {
dev->L2_cache_.resize(cpu_count); mode_ = LITE_POWER_LOW;
dev->L3_cache_.resize(cpu_count); if (thread_num > little_core_size) {
#ifdef TARGET_IOS LOG(WARNING) << "Request thread num: " << thread_num
for (int i = 0; i < cpu_count; ++i) { << ", exceed the little cores size: " << little_core_size
dev->L1_cache_[i] = 64 * 1024; << ", truncate thread num to " << little_core_size;
dev->L2_cache_[i] = 2048 * 1024; active_ids_ = little_core_ids_;
dev->L3_cache_[i] = 0; } else {
} for (int i = 0; i < thread_num; i++) {
#else active_ids_.push_back(little_core_ids_[i]);
for (int i = 0; i < cpu_count; ++i) {
dev->L1_cache_[i] = 32 * 1024;
dev->L2_cache_[i] = 512 * 1024;
dev->L3_cache_[i] = 0;
} }
#endif
}
std::string arm_get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
} }
char line[1024]; } else {
while (!feof(fp)) { mode_ = LITE_POWER_HIGH;
char* s = fgets(line, 1024, fp); LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (!s) { if (thread_num > big_core_size) {
break; active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
} }
if (strstr(line, "Hardware") != NULL) {
fclose(fp);
return std::string(line);
} }
} }
fclose(fp);
return "";
} }
int get_max_freq_khz(int cpuid) { void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
// first try, for all possible cpu active_ids_.clear();
char path[256]; for (int i = 0; i < thread_num; i++) {
snprintf(path, sizeof(path), active_ids_.push_back(0);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
fp = fopen(path, "rb");
if (!fp) {
return -1;
} }
mode_ = LITE_POWER_NO_BIND;
}
int max_freq_khz = -1; void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
fscanf(fp, "%d", &max_freq_khz); const int thread_num) {
int big_core_size = big_core_ids_.size();
fclose(fp); int little_core_size = little_core_ids_.size();
if (big_core_size > 0) {
return max_freq_khz; mode_ = LITE_POWER_RAND_HIGH;
if (thread_num > big_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
} }
} }
} else {
int max_freq_khz = 0; mode_ = LITE_POWER_LOW;
while (!feof(fp)) { LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
int freq_khz = 0; if (thread_num > little_core_size) {
int nscan = fscanf(fp, "%d %*d", &freq_khz); active_ids_ = little_core_ids_;
if (nscan != 1) { } else {
break; for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
} }
} }
fclose(fp);
return max_freq_khz;
} }
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids, void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const std::vector<int>& cpu_freq, const int thread_num) {
std::vector<int>* cluster_ids) { int big_core_size = big_core_ids_.size();
if (cpu_count == 0) { int little_core_size = little_core_ids_.size();
return 0; active_ids_.clear();
} if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
cpuids->resize(cpu_count); if (thread_num > little_core_size) {
cluster_ids->resize(cpu_count); LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
for (int i = 0; i < cpu_count; i++) { << ", truncate thread num to " << little_core_size;
cpuids->at(i) = i; active_ids_ = little_core_ids_;
} } else {
for (int i = 0; i < thread_num; ++i) {
// sort cpuid as big core first active_ids_.push_back(
// simple bubble sort little_core_ids_[(i + shift_num) % little_core_size]);
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
}
} }
} }
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
cluster_ids->at(i) = 0;
} else { } else {
cluster_ids->at(i) = 1; mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
} }
} }
return 0;
} }
int check_online(const std::vector<int>& core_ids) { int DeviceInfo::Setup() {
if (core_ids.size() == 0) { core_num_ = get_cpu_num();
return 0; mem_size_ = get_mem_size();
} get_cpu_arch(&archs_, core_num_);
char path[256]; // set defalut CPU info
int online = 1; SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE);
for (int i = 0; i < core_ids.size(); ++i) { SetCacheInfo(1, DEFAULT_L2_CACHE_SIZE);
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", SetCacheInfo(2, DEFAULT_L3_CACHE_SIZE);
core_ids[i]); #ifdef LITE_WITH_LINUX
FILE* fp = fopen(path, "rb"); // get max&min freq
if (!fp) { max_freqs_.resize(core_num_);
min_freqs_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
int max_freq, min_freq;
get_cpu_max_min_freq(i, &max_freq, &min_freq);
max_freqs_[i] = max_freq / 1000;
min_freqs_[i] = min_freq / 1000;
}
// get cache size and big.LITTLE core ids
dev_name_ = get_cpu_name();
if (!SetCPUInfoByName()) {
SetCPUInfoByProb();
}
// output info
LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
LOG(INFO) << "ARM multiprocessors number: " << core_num_;
for (int i = 0; i < core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
<< ", max freq: " << max_freqs_[i]
<< ", min freq: " << min_freqs_[i]
<< ", cluster ID: " << cluster_ids_[core_ids_[i]]
<< ", CPU ARCH: A" << archs_[i];
}
LOG(INFO) << "L1 DataCache size is: ";
for (int i = 0; i < core_num_; ++i) {
LOG(INFO) << L1_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "L2 Cache size is: ";
for (int i = 0; i < core_num_; ++i) {
LOG(INFO) << L2_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
// set default run mode
SetRunMode(LITE_POWER_NO_BIND, 1); // use single thread by default
return 0; return 0;
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
} }
int set_sched_affinity(const std::vector<int>& cpuids) { void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
// #define CPU_SETSIZE 1024 #ifdef ARM_WITH_OMP
// #define __NCPUBITS (8 * sizeof (unsigned long)) thread_num = std::min(thread_num, core_num_);
// typedef struct
// {
// unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
// } cpu_set_t;
// set affinity for thread
#ifdef __GLIBC__
pid_t pid = syscall(SYS_gettid);
#else #else
pid_t pid = gettid(); thread_num = 1; // force thread_num to 1 if OpenMP is disabled
#endif #endif
cpu_set_t mask; #ifdef LITE_WITH_LINUX
CPU_ZERO(&mask); int big_core_size = big_core_ids_.size();
for (int i = 0; i < cpuids.size(); i++) { int little_core_size = little_core_ids_.size();
CPU_SET(cpuids[i], &mask); int big_little_core_size = big_core_size + little_core_size;
thread_num = std::min(thread_num, big_little_core_size);
count_++;
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
RequestPowerFullMode(thread_num);
break;
case LITE_POWER_HIGH:
RequestPowerHighMode(thread_num);
break;
case LITE_POWER_LOW:
RequestPowerLowMode(thread_num);
break;
case LITE_POWER_NO_BIND:
RequestPowerNoBindMode(thread_num);
break;
case LITE_POWER_RAND_HIGH:
RequestPowerRandHighMode(shift_num, thread_num);
break;
case LITE_POWER_RAND_LOW:
RequestPowerRandLowMode(shift_num, thread_num);
break;
default:
LOG(FATAL) << "Unsupported power mode: " << mode;
break;
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
#ifdef ARM_WITH_OMP
omp_set_num_threads(active_ids_.size());
#endif
if (mode_ != LITE_POWER_NO_BIND) {
if (check_cpu_online(active_ids_)) {
bind_threads(active_ids_);
} else {
LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
mode_ = LITE_POWER_NO_BIND;
} }
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
} }
#else // LITE_WITH_LINUX
// only LITE_POWER_NO_BIND is supported in other OS
RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
omp_set_num_threads(active_ids_.size());
#endif
#endif // LITE_WITH_LINUX
//! alloc memory for sgemm in this context
workspace_.Resize(
{static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
arch_ = archs_[active_ids_[0]];
}
return 0; void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
SetCacheInfo(0, l1size);
SetCacheInfo(1, l2size);
SetCacheInfo(2, l3size);
workspace_.Resize({2 * (l1size + l2size)});
} }
#endif // LITE_WITH_LINUX bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
auto count = dims.product();
auto old = workspace_.dims();
if (count == old.product()) {
return false;
}
workspace_.Resize({static_cast<int64_t>(
count + L2_cache_[active_ids_[0]] / sizeof(float))});
return true;
}
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
......
...
@@ -14,6 +14,7 @@
 #pragma once
+#include <cstdarg>
 #include <string>
 #include <vector>
 #include "paddle/fluid/lite/core/lite_tensor.h"
...
@@ -47,92 +48,73 @@ typedef enum {
 class DeviceInfo {
  public:
-  int idx_;
-  int max_freq_;
-  int min_freq_;
-  int generate_arch_;
-  int compute_core_num_;
-  int max_memory_;
-  int sharemem_size_;
-  std::string device_name_;
-  std::string compute_ability_;
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-  ARMArch arch_;
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
-
   static DeviceInfo& Global() {
     static auto* x = new DeviceInfo;
     return *x;
   }

-  static void Init() {
-    auto& info = Global();
-    InitInternal(&info);
+  static int Init() {
+    static int ret = Global().Setup();
+    return ret;
   }

-  void SetRunMode(PowerMode mode, int threads);
+  int Setup();
+
+  void SetRunMode(PowerMode mode, int thread_num);
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }
-  void BindDev();

   PowerMode mode() const { return mode_; }
   int threads() const { return active_ids_.size(); }
   ARMArch arch() const { return arch_; }
-  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
-  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
-  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }

   template <typename T>
   T* workspace_data() {
     return workspace_.mutable_data<T>();
   }
+
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
+
   bool ExtendWorkspace(DDimLite dims);

  private:
-  DeviceInfo() = default;
-  static void InitInternal(DeviceInfo* dev);
-};
-
-size_t arm_get_meminfo();
-
-int arm_get_cpucount();
-
-void arm_get_cpu_arch(std::vector<ARMArch>* archs);
-
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
-
-#ifdef LITE_WITH_LINUX
-
-void set_default_cache(DeviceInfo* dev);
-
-std::string arm_get_cpu_name();
-
-int get_max_freq_khz(int cpuid);
-
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids);
-int check_online(const std::vector<int>& core_ids);
-int set_sched_affinity(const std::vector<int>& cpuids);
-
-#endif  // LITE_WITH_LINUX
+  int core_num_;
+  std::vector<int> max_freqs_;
+  std::vector<int> min_freqs_;
+  int mem_size_;
+  std::string dev_name_;
+
+  std::vector<int> L1_cache_;
+  std::vector<int> L2_cache_;
+  std::vector<int> L3_cache_;
+  std::vector<int> core_ids_;
+  std::vector<int> big_core_ids_;
+  std::vector<int> little_core_ids_;
+  std::vector<int> cluster_ids_;
+  std::vector<ARMArch> archs_;
+  ARMArch arch_;
+  // LITE_POWER_HIGH stands for using big cores,
+  // LITE_POWER_LOW stands for using small core,
+  // LITE_POWER_FULL stands for using all cores
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
+
+  void SetCacheInfo(int cache_id, int argc, ...);
+  void SetArchInfo(int argc, ...);
+  bool SetCPUInfoByName();
+  void SetCPUInfoByProb();
+  void RequestPowerFullMode(const int thread_num);
+  void RequestPowerHighMode(const int thread_num);
+  void RequestPowerLowMode(const int thread_num);
+  void RequestPowerNoBindMode(const int thread_num);
+  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
+  void RequestPowerRandLowMode(const int shift_num, const int thread_num);
+
+  DeviceInfo() = default;
+};

 #endif // LITE_WITH_ARM
......
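The power-mode comment in the header above summarizes the binding policy. A second illustrative sketch (not part of the commit) of how a caller might switch modes; the function PickPowerMode() is hypothetical, and the fallback behaviour described in the comments is taken from SetRunMode and the RequestPower*Mode helpers in this diff.

#include "paddle/fluid/lite/core/cpu_info.h"

void PickPowerMode(bool latency_sensitive) {
  auto& dev = paddle::lite::DeviceInfo::Global();
  if (latency_sensitive) {
    // LITE_POWER_HIGH binds worker threads to the big cluster; if the
    // requested thread count exceeds the big-cluster size it is truncated
    // and an error is logged.
    dev.SetRunMode(paddle::lite::LITE_POWER_HIGH, 4);
  } else {
    // LITE_POWER_NO_BIND skips CPU affinity entirely; it is also the mode
    // SetRunMode falls back to when a requested core turns out to be offline.
    dev.SetRunMode(paddle::lite::LITE_POWER_NO_BIND, 2);
  }
}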