提交 ad333ac5 编写于 作者: H Hong Ming

make all ARMContext share the same DeviceInfo, and export SetRunMode to allow users to set the thread num

make all ARMContext share the same DeviceInfo, and export SetRunMode to allow users to set the thread num
......@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
return counter.count() / 1000.0;
}
void Run(const char* model_dir, int repeat) {
void Run(const char* model_dir, int repeat, int thread_num) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
......@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
} // namespace paddle
// Entry point: runs the benchmark with a model directory, a repeat count and
// a thread count taken from the command line.
// NOTE(fix): the diff-merged text contained both the old (argc == 3) and the
// new (argc == 4) argument checks and both Run() calls back to back, which
// could never pass; this resolves to the new three-argument form.
int main(int argc, char** argv) {
  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
  paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
  return 0;
}
......
......@@ -13,322 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/cpu_info.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
// Overwrite the per-core cache sizes recorded in the global DeviceInfo and
// re-size this context's scratch workspace to 2 * (L1 + L2) bytes.
void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
  auto& device_info = DeviceInfo::Global();
  const int core_count = arm_get_cpucount();
  // assign() resizes and fills every entry with the same value.
  device_info.L1_cache_.assign(core_count, l1size);
  device_info.L2_cache_.assign(core_count, l2size);
  device_info.L3_cache_.assign(core_count, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Default construction: run on core 0 in LITE_POWER_HIGH mode with a scratch
// workspace sized from that core's L2 cache (in floats).
Context<TargetType::kARM>::Context() {
  active_ids_ = {0};
  mode_ = LITE_POWER_HIGH;
  auto& device_info = DeviceInfo::Global();
  const int64_t l2_float_count = static_cast<int64_t>(
      device_info.L2_cache_[active_ids_[0]] / sizeof(float));
  workspace_.Resize({l2_float_count});
#ifdef TARGET_IOS
  arch_ = APPLE;  // use 6x8
#else
  // Tune for the first big core's micro-architecture when one is known.
  if (!device_info.big_core_ids_.empty()) {
    arch_ = device_info.archs_[device_info.big_core_ids_[0]];
  }
#endif
}
// Returns the power mode this context was last configured with.
PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
// Returns the thread count, i.e. the number of active (bound) core ids.
int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
// Copy construction: reuse the copy-assignment operator, which duplicates
// every configuration field (mode, active core ids, workspace, arch, count).
Context<TargetType::kARM>::Context(const ARMContext& ctx) { *this = ctx; }
// Copy-assignment: duplicates the full run configuration of `ctx` into this
// context, including the scratch workspace tensor.
ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
  mode_ = ctx.mode_;
  active_ids_ = ctx.active_ids_;
  workspace_ = ctx.workspace_;
  arch_ = ctx.arch_;
  count_ = ctx.count_;
  return *this;
}
// Bind the worker thread(s) to the cores listed in active_ids_.
// With OpenMP, every OMP worker restricts its own affinity to the active set;
// without OpenMP, only the current thread is pinned to the first active core.
// Logs an error and returns early if any affinity call fails.
void Context<TargetType::kARM>::BindDev() {
#ifdef ARM_WITH_OMP
  int num_threads = active_ids_.size();
  omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX
  std::vector<int> ssarets(num_threads, 0);
#pragma omp parallel for
  for (int i = 0; i < num_threads; i++) {
    ssarets[i] = set_sched_affinity(active_ids_);
  }
  for (int i = 0; i < num_threads; i++) {
    if (ssarets[i] != 0) {
      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
      return;
    }
  }
#endif  // LITE_WITH_LINUX
#else   // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
  std::vector<int> cpuid1;
  cpuid1.push_back(active_ids_[0]);
  int ssaret = set_sched_affinity(cpuid1);
  if (ssaret != 0) {
    // NOTE(fix): was printf(); use LOG(ERROR) for consistency with the
    // OpenMP branch above.
    LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[0];
    return;
  }
#endif  // LITE_WITH_LINUX
#endif  // ARM_WITH_OMP
}
// Select the active cores and thread count for this context from the
// requested power mode:
//   LITE_POWER_HIGH / LOW        - big / little cluster only
//   LITE_POWER_FULL              - big cores first, then little cores
//   LITE_POWER_NO_BIND           - threads are not pinned to specific cores
//   LITE_POWER_RAND_HIGH / _LOW  - HIGH/LOW with a rotating start core
// When the requested cluster is empty it falls back to the other cluster
// (with a warning). Finally the workspace is re-sized from the first active
// core's L2 cache and arch_ is refreshed from that core.
void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
  DeviceInfo& dev = DeviceInfo::Global();
  int big_core_size = dev.big_core_ids_.size();
  int small_core_size = dev.little_core_ids_.size();
  // Clamp the request to the number of physical cores.
  if (threads > big_core_size + small_core_size) {
    threads = big_core_size + small_core_size;
  }
#ifdef ARM_WITH_OMP
  count_++;
  // Rotate the start core for the RAND_* modes every 10 calls.
  // NOTE(fix): guard the modulo -- big_core_size may be 0 and `% 0` is UB.
  int shift_num = big_core_size > 0 ? (count_ / 10) % big_core_size : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      mode_ = mode;
      active_ids_.clear();
      for (int i = 0; i < threads; ++i) {
        if (i < big_core_size) {
          active_ids_.push_back(dev.big_core_ids_[i]);
        } else {
          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_HIGH;
        if (threads > big_core_size) {
          LOG(ERROR) << "threads: " << threads
                     << ", exceed the big cores size: " << big_core_size;
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      } else {
        // No big cluster: fall back to the little cores.
        mode_ = LITE_POWER_LOW;
        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
        if (threads > small_core_size) {
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: "
                       << small_core_size;
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      } else {
        // No little cluster: fall back to the big cores.
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        if (threads > big_core_size) {
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_NO_BIND:
      mode_ = LITE_POWER_NO_BIND;
      active_ids_.clear();
      // NOTE(fix): cast to int to avoid a signed/unsigned comparison.
      if (threads > static_cast<int>(dev.core_ids_.size())) {
        active_ids_.resize(dev.core_ids_.size());
      } else {
        active_ids_.resize(threads);
      }
      break;
    case LITE_POWER_RAND_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_RAND_HIGH;
        if (threads > big_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the big cores size: " << big_core_size;
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(
                dev.big_core_ids_[(i + shift_num) % big_core_size]);
          }
        }
      } else {
        mode_ = LITE_POWER_LOW;
        LOG(WARNING)
            << "HIGH POWER MODE is not support, switch to little cores";
        if (threads > small_core_size) {
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_RAND_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_RAND_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: "
                       << small_core_size;
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(
                dev.little_core_ids_[(i + shift_num) % small_core_size]);
          }
        }
      } else {
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        if (threads > big_core_size) {
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
  }
  //! fix multi-threads LITE_POWER_HIGH mode
  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
    // NOTE(fix): renamed the local so it no longer shadows the parameter.
    int active_threads = active_ids_.size();
    omp_set_num_threads(active_threads);
  } else {
    if (check_online(active_ids_)) {
      BindDev();
    } else {
      LOG(ERROR) << "core id " << active_ids_[0]
                 << " is offline, switch to NO BIND MODE";
      int active_threads = active_ids_.size();
      omp_set_num_threads(active_threads);
    }
  }
#else
  // Without OpenMP only a single thread runs: pick the first big core.
  if (big_core_size > 0) {
    active_ids_ = {dev.big_core_ids_[0]};
  } else {
    active_ids_ = {0};
  }
#endif
  //! alloc memory for sgemm in this context
  int temp_mem_size =
      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
  workspace_.Resize({temp_mem_size});
  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
}
// Returns the CPU micro-architecture the kernels are currently tuned for.
ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
// Overrides the detected micro-architecture.
void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
// L1 data-cache size, in bytes, of the first active core.
int Context<TargetType::kARM>::l1_cache_size() const {
  return DeviceInfo::Global().L1_cache_[active_ids_[0]];
}
// L2 cache size, in bytes, of the first active core.
int Context<TargetType::kARM>::l2_cache_size() const {
  return DeviceInfo::Global().L2_cache_[active_ids_[0]];
}
// L3 cache size, in bytes, of the first active core.
int Context<TargetType::kARM>::l3_cache_size() const {
  return DeviceInfo::Global().L3_cache_[active_ids_[0]];
}
// Re-size the scratch workspace for a new element count. Returns false when
// the requested count already equals the current workspace size, true when
// the workspace was re-sized (with extra room of one L2 cache worth of
// floats beyond the request).
bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize(
      {static_cast<int64_t>(requested + l2_cache_size() / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
} // namespace lite
namespace lite {} // namespace lite
} // namespace paddle
......@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
template <>
class Context<TargetType::kARM> {
public:
Context();
Context(PowerMode mode, int threads);
Context() {}
explicit Context(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx) {}
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch);
void BindDev();
void SetRunMode(PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads);
}
void SetCache(int l1size, int l2size, int l3size) {
return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const;
int threads() const;
ARMArch arch() const;
PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
return DeviceInfo::Global().workspace_data<T>();
}
int l1_cache_size() const;
int l2_cache_size() const;
int l3_cache_size() const;
bool ExtendWorkspace(DDimLite dims);
bool ExtendWorkspace(DDimLite dims) {
return DeviceInfo::Global().ExtendWorkspace(dims);
}
std::string name() const { return "ARMContext"; }
private:
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
ARMArch arch_;
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
};
#endif
......
......@@ -12,312 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
#include <algorithm>
#include <limits>
#include "paddle/fluid/lite/core/cpu_info.h"
#include <cstdarg>
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
void DeviceInfo::InitInternal(DeviceInfo* dev) {
set_default_cache(dev);
dev->compute_core_num_ = arm_get_cpucount();
dev->max_memory_ = arm_get_meminfo();
// get max freq
#ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_);
for (int i = 0; i < dev->compute_core_num_; ++i) {
max_freq[i] = get_max_freq_khz(i) / 1000;
}
std::string cpu_name = arm_get_cpu_name();
if (get_cpu_info_from_name(dev, cpu_name) != true) {
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
max_freq, &dev->cluster_ids_);
dev->big_core_ids_.clear();
dev->little_core_ids_.clear();
for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
if (dev->cluster_ids_[i] == 0) {
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
}
}
arm_get_cpu_arch(&dev->archs_);
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_;
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i]
<< ", frequence: " << max_freq[i]
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
if (dev->max_freq_ < max_freq[j]) {
dev->max_freq_ = max_freq[j];
}
}
#elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_);
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#else
const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#endif
}
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &cpu_info->L1_cache_;
break;
case 1:
cache = &cpu_info->L2_cache_;
break;
case 2:
cache = &cpu_info->L3_cache_;
break;
default:
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
int core_num = cpu_info->compute_core_num_;
cache->resize(core_num);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size;
}
cpu_num++;
fclose(fp);
}
va_end(arg_ptr);
}
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
int core_num = cpu_info->compute_core_num_;
cpu_info->archs_.resize(core_num);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
cpu_info->archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch;
}
if (cpu_num < 1) {
cpu_num = 1;
}
va_end(arg_ptr);
}
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
/* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024);
set_cache_info(cpu_info, 2, 1, 2048 * 1024);
return true;
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
return true;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
set_cache_info(cpu_info, 0, 2, 64 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024,
/*real cache size is 2M, while that will get bad performace
on conv3x3s1 or gemm, set to 1M or 512K*/
1024 * 1024);
return true;
} else if (hardware_name.find("MSM8996") != std::string::npos) { // 820
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {2, 3};
cpu_info->little_core_ids_ = {0, 1};
cpu_info->cluster_ids_ = {1, 1, 0, 0};
set_arch_info(cpu_info, 1, kA72);
set_cache_info(cpu_info, 0, 1, 24 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("SDM660") != std::string::npos ||
hardware_name.find("SDM636") != std::string::npos) { // 660, 636
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA73);
set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MSM8953") != std::string::npos) { // 625
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8939") != std::string::npos) { // 615
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {4, 5, 6, 7};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
return true;
/* MediaTek */
} else if (hardware_name.find("MT6797") !=
std::string::npos) { // X20/X23/X25/X27
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6795") != std::string::npos ||
hardware_name.find("MT6762") != std::string::npos ||
hardware_name.find("MT6755T") != std::string::npos ||
hardware_name.find("MT6755S") != std::string::npos ||
hardware_name.find("MT6753") != std::string::npos ||
hardware_name.find("MT6752") != std::string::npos ||
hardware_name.find("MT6750") != std::string::npos) {
// X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6758") != std::string::npos ||
hardware_name.find("MT6757") != std::string::npos ||
hardware_name.find("MT6763") != std::string::npos ||
hardware_name.find("MT6755M") != std::string::npos ||
hardware_name.find("MT6755") !=
std::string::npos) { // P30, P20/P25, P23, P10
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6765") != std::string::npos ||
hardware_name.find("MT6739") != std::string::npos ||
hardware_name.find("MT6738") != std::string::npos ||
hardware_name.find("MT6737") !=
std::string::npos) { // A22, MT6739, MT6738, MT6767
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 0, 0, 0};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
}
return false;
return cpu_num;
#else
return 1;
#endif
}
size_t arm_get_meminfo() {
size_t get_mem_size() {
#ifdef LITE_WITH_LINUX
// read the total memory size (MemTotal) from /proc/meminfo
FILE* fp = fopen("/proc/meminfo", "rb");
if (!fp) {
return 1;
}
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
......@@ -327,52 +96,18 @@ size_t arm_get_meminfo() {
}
sscanf(s, "MemTotal: %d kB", &memsize);
}
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
}
int arm_get_cpucount() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_count = 20;
int count = 0;
for (int i = 0; i < max_cpu_count; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
count++;
fclose(fp);
}
if (count < 1) {
count = 1;
}
return count;
#elif defined(TARGET_IOS)
int count = 0;
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
if (count < 1) {
count = 1;
}
return count;
#else
return 1;
#endif
return 0;
}
void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
#ifdef LITE_WITH_LINUX
void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
archs->clear();
#ifdef LITE_WITH_LINUX
//! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
......@@ -406,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
......@@ -415,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
int cpu_count = arm_get_cpucount();
if (archs->size() < cpu_count) {
for (int i = archs->size(); i < cpu_count; ++i) {
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
}
}
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(APPLE);
}
#else
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(kARMArch_UNKOWN);
}
#endif
}
#ifdef LITE_WITH_LINUX
// Fill the DeviceInfo cache tables with conservative per-core defaults
// (larger values on iOS targets, smaller ones elsewhere).
void set_default_cache(DeviceInfo* dev) {
  const int core_count = arm_get_cpucount();
#ifdef TARGET_IOS
  const int default_l1 = 64 * 1024;
  const int default_l2 = 2048 * 1024;
#else
  const int default_l1 = 32 * 1024;
  const int default_l2 = 512 * 1024;
#endif
  dev->L1_cache_.assign(core_count, default_l1);
  dev->L2_cache_.assign(core_count, default_l2);
  dev->L3_cache_.assign(core_count, 0);
}
std::string arm_get_cpu_name() {
std::string get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
......@@ -477,122 +217,163 @@ std::string arm_get_cpu_name() {
return "";
}
int get_max_freq_khz(int cpuid) {
void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
*max_freq = 0;
*min_freq = 0;
// first try, for all possible cpu
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return -1;
return;
}
int max_freq_khz = -1;
fscanf(fp, "%d", &max_freq_khz);
fscanf(fp, "%d", max_freq);
fclose(fp);
return max_freq_khz;
// get min_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
}
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
}
}
int max_freq_khz = 0;
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break;
}
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
if (freq > *max_freq) {
*max_freq = freq;
}
if (freq < *min_freq) {
*min_freq = freq;
}
}
fclose(fp);
return max_freq_khz;
}
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids) {
if (cpu_count == 0) {
return 0;
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
}
cpuids->resize(cpu_count);
cluster_ids->resize(cpu_count);
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
cpu_ids->resize(cpu_num);
cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
}
// sort cpuid as big core first
// simple bubble sort
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
for (int i = 0; i < cpu_num; i++) {
for (int j = i + 1; j < cpu_num; j++) {
if (max_freqs[i] < max_freqs[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
int tmp = cpu_ids->at(i);
cpu_ids->at(i) = cpu_ids->at(j);
cpu_ids->at(j) = tmp;
}
}
}
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else {
cluster_ids->at(i) = 1;
}
}
return 0;
}
int check_online(const std::vector<int>& core_ids) {
if (core_ids.size() == 0) {
return 0;
// Read the cache hierarchy of core `cpu_id` from sysfs
// (/sys/devices/system/cpu/cpuN/cache/indexM/{level,size}). Any level that
// cannot be read keeps the platform default size.
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
                        int* l3_cache_size) {
  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
  const int max_cache_idx_num = 10;
  for (int idx = 0; idx < max_cache_idx_num; idx++) {
    char path[256];
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, idx);
    FILE* fp = fopen(path, "rb");
    if (!fp) {
      continue;  // this cache index does not exist
    }
    int level = -1;
    fscanf(fp, "%d", &level);
    fclose(fp);
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, idx);
    fp = fopen(path, "rb");
    if (!fp) {
      continue;
    }
    int size = -1;
    fscanf(fp, "%d", &size);
    fclose(fp);
    if (size < 0) {
      continue;  // unreadable size: keep the default for this level
    }
    // sysfs reports the size in KiB.
    if (level == 1) {
      *l1_cache_size = size * 1024;
    } else if (level == 2) {
      *l2_cache_size = size * 1024;
    } else if (level == 3) {
      *l3_cache_size = size * 1024;
    }
  }
}
bool check_cpu_online(const std::vector<int>& cpu_ids) {
if (cpu_ids.size() == 0) {
return false;
}
char path[256];
int online = 1;
for (int i = 0; i < core_ids.size(); ++i) {
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
core_ids[i]);
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
if (!fp) {
return 0;
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
}
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpuids) {
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
......@@ -608,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpuids.size(); i++) {
CPU_SET(cpuids[i], &mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
}
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
}
return 0;
}
// Pin the current worker thread(s) to `cpu_ids`. With OpenMP every worker is
// restricted to the whole set; without OpenMP only the calling thread is
// pinned to the first id. Returns true when every affinity call succeeds.
// NOTE(fix): the original fell off the end without a return statement, which
// is undefined behaviour for a function returning bool; the success path now
// returns true explicitly.
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
  int thread_num = cpu_ids.size();
  omp_set_num_threads(thread_num);
  std::vector<int> ssarets(thread_num, 0);
#pragma omp parallel for
  for (int i = 0; i < thread_num; i++) {
    ssarets[i] = set_sched_affinity(cpu_ids);
  }
  for (int i = 0; i < thread_num; i++) {
    if (ssarets[i] != 0) {
      LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
      return false;
    }
  }
#else   // ARM_WITH_OMP
  std::vector<int> first_cpu_id;
  first_cpu_id.push_back(cpu_ids[0]);
  int ssaret = set_sched_affinity(first_cpu_id);
  if (ssaret != 0) {
    LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
    return false;
  }
#endif  // ARM_WITH_OMP
  return true;
}
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
// Variadic: argc == 1 passes one size applied to every core; argc == 2
// passes (big-core size, little-core size) applied per cluster.
void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
  va_list arg_ptr;
  va_start(arg_ptr, argc);
  std::vector<int>* cache = nullptr;
  switch (cache_id) {
    case 0:
      cache = &L1_cache_;
      break;
    case 1:
      cache = &L2_cache_;
      break;
    case 2:
      cache = &L3_cache_;
      break;
    default:
      break;
  }
  if (cache == nullptr) {
    // NOTE(fix): the original left `cache` uninitialized on an unknown
    // cache_id and dereferenced it below; bail out instead.
    va_end(arg_ptr);
    return;
  }
  cache->resize(core_num_);
  if (argc == 1) {
    int cache_size = va_arg(arg_ptr, int);
    for (int i = 0; i < core_num_; ++i) {
      (*cache)[i] = cache_size;
    }
  } else {
    int big_core_num = big_core_ids_.size();
    int little_core_num = little_core_ids_.size();
    int big_core_cache_size = va_arg(arg_ptr, int);
    int little_core_cache_size = va_arg(arg_ptr, int);
    for (int i = 0; i < big_core_num; ++i) {
      (*cache)[big_core_ids_[i]] = big_core_cache_size;
    }
    for (int i = 0; i < little_core_num; ++i) {
      (*cache)[little_core_ids_[i]] = little_core_cache_size;
    }
  }
  va_end(arg_ptr);
}
void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
archs_.resize(core_num_);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
archs_[big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
archs_[little_core_ids_[i]] = little_core_arch;
}
}
va_end(arg_ptr);
}
// Fill in CPU topology (core ids, big/little clusters), per-cluster
// micro-architecture and cache sizes from a hard-coded table keyed by the
// SoC name in dev_name_.  Returns true when the SoC is recognized; false
// tells the caller (Setup) to fall back to SetCPUInfoByProb().
// Note: in cluster_ids_, 0 marks a big core and 1 a little core.
// SetCacheInfo(cache_id, argc, ...): argc == 2 passes big-core size then
// little-core size; argc == 1 passes one size shared by all cores.
bool DeviceInfo::SetCPUInfoByName() {
  /* Snapdragon */
  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA76, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {6, 7};
    little_core_ids_ = {0, 1, 2, 3, 4, 5};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8998") != std::string::npos) {  // 835
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024,
                 /*real cache size is 2M, while that will get bad performance
                    on conv3x3s1 or gemm, set to 1M or 512K*/
                 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8996") != std::string::npos) {  // 820
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {2, 3};
    little_core_ids_ = {0, 1};
    cluster_ids_ = {1, 1, 0, 0};
    SetArchInfo(1, kA72);
    SetCacheInfo(0, 1, 24 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("SDM660") != std::string::npos ||
             dev_name_.find("SDM636") != std::string::npos) {  // 660, 636
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA73);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8976") != std::string::npos) {  // 652,653
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MSM8953") != std::string::npos) {  // 625
    // Homogeneous octa-core: every core is treated as "big".
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8939") != std::string::npos) {  // 615
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {4, 5, 6, 7};
    cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
    return true;
    /* MediaTek */
  } else if (dev_name_.find("MT6797") !=
             std::string::npos) {  // X20/X23/X25/X27
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MT6799") != std::string::npos) {  // X30
    // NOTE(review): no SetCacheInfo here — this SoC keeps the default cache
    // sizes set by Setup(); confirm that is intentional.
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6795") != std::string::npos ||
             dev_name_.find("MT6762") != std::string::npos ||
             dev_name_.find("MT6755T") != std::string::npos ||
             dev_name_.find("MT6755S") != std::string::npos ||
             dev_name_.find("MT6753") != std::string::npos ||
             dev_name_.find("MT6752") != std::string::npos ||
             dev_name_.find("MT6750") != std::string::npos) {
    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6758") != std::string::npos ||
             dev_name_.find("MT6757") != std::string::npos ||
             dev_name_.find("MT6763") != std::string::npos ||
             dev_name_.find("MT6755M") != std::string::npos ||
             dev_name_.find("MT6755") !=
                 std::string::npos) {  // P30, P20/P25, P23, P10
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6771") != std::string::npos) {  // P60
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6765") != std::string::npos ||
             dev_name_.find("MT6739") != std::string::npos ||
             dev_name_.find("MT6738") != std::string::npos ||
             dev_name_.find("MT6737") !=
                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  }
  // Unknown SoC: let the caller probe sysfs instead.
  return false;
}
// Probe the CPU topology at runtime (Linux sysfs) when the SoC name is not
// in the hard-coded table: classify big/little cores by max frequency and
// read each core's cache sizes.
void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX
  // Sort cores by max frequency; cluster id 0 denotes the big cluster.
  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
  big_core_ids_.clear();
  little_core_ids_.clear();
  const int cluster_count = cluster_ids_.size();
  for (int idx = 0; idx < cluster_count; ++idx) {
    auto& bucket =
        (cluster_ids_[idx] == 0) ? big_core_ids_ : little_core_ids_;
    bucket.push_back(core_ids_[idx]);
  }
  // Query the L1/L2/L3 data-cache sizes for every core.
  for (int core = 0; core < core_num_; ++core) {
    get_cpu_cache_size(core, &L1_cache_[core], &L2_cache_[core],
                       &L3_cache_[core]);
  }
#endif  // LITE_WITH_LINUX
}
void DeviceInfo::RequestPowerFullMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
for (int i = 0; i < thread_num; ++i) {
if (i < big_core_size) {
active_ids_.push_back(big_core_ids_[i]);
} else if (i < big_core_size + little_core_size) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
}
void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// LITE_POWER_NO_BIND: no core affinity; active_ids_ holds thread_num
// placeholder entries (core id 0) so threads()/workspace sizing still work.
void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
  active_ids_.assign(thread_num, 0);
  mode_ = LITE_POWER_NO_BIND;
}
// LITE_POWER_RAND_HIGH: like HIGH mode but rotates the starting big core by
// shift_num to spread thermal load. Falls back to the little cluster when
// there are no big cores.
void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
                                          const int thread_num) {
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  // Fix: clear stale bindings from a previous SetRunMode call. Every other
  // RequestPower*Mode clears active_ids_; without this the list keeps
  // growing across calls.
  active_ids_.clear();
  if (big_core_size > 0) {
    mode_ = LITE_POWER_RAND_HIGH;
    if (thread_num > big_core_size) {
      LOG(WARNING) << "Request thread num: " << thread_num
                   << ", exceed the big cores size: " << big_core_size
                   << ", truncate thread num to " << big_core_size;
      active_ids_ = big_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
      }
    }
  } else {
    mode_ = LITE_POWER_LOW;
    LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
    if (thread_num > little_core_size) {
      active_ids_ = little_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(little_core_ids_[i]);
      }
    }
  }
}
void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % little_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// One-time device probing: core count, architecture, frequencies, cache
// sizes and big/little topology. Called once via DeviceInfo::Init().
// Returns 0 on success.
int DeviceInfo::Setup() {
  core_num_ = get_cpu_num();
  mem_size_ = get_mem_size();
  get_cpu_arch(&archs_, core_num_);
  // Set default CPU cache info. Fix: SetCacheInfo(cache_id, argc, ...)
  // takes the count of variadic size arguments as `argc`; previously the
  // cache size itself was passed as argc, so the vararg reader consumed
  // two arguments that were never supplied (undefined behavior).
  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
#ifdef LITE_WITH_LINUX
  // Read each core's max/min frequency (converted from kHz to MHz).
  max_freqs_.resize(core_num_);
  min_freqs_.resize(core_num_);
  for (int i = 0; i < core_num_; ++i) {
    int max_freq, min_freq;
    get_cpu_max_min_freq(i, &max_freq, &min_freq);
    max_freqs_[i] = max_freq / 1000;
    min_freqs_[i] = min_freq / 1000;
  }
  // Prefer the hard-coded SoC table; fall back to sysfs probing.
  dev_name_ = get_cpu_name();
  if (!SetCPUInfoByName()) {
    SetCPUInfoByProb();
  }
  // Log the detected configuration.
  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
              << ", max freq: " << max_freqs_[i]
              << ", min freq: " << min_freqs_[i]
              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
              << ", CPU ARCH: A" << archs_[i];
  }
  LOG(INFO) << "L1 DataCache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "L2 Cache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
  // Default run mode: no binding, single thread.
  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
  return 0;
}
// Select the power mode and thread count, bind threads to cores (Linux),
// configure OpenMP, and size the shared workspace for the chosen cores.
// thread_num is clamped to the available core count; without OpenMP it is
// forced to 1.
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
#else
  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
#endif
#ifdef LITE_WITH_LINUX
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  int big_little_core_size = big_core_size + little_core_size;
  thread_num = std::min(thread_num, big_little_core_size);
  count_++;
  // Rotate rand-mode bindings every 10 calls. Fix: guard the modulo —
  // big_core_size is 0 on little-only topologies (and on homogeneous SoCs
  // probed as a single cluster), which made this divide by zero.
  int shift_num =
      (big_core_size > 0) ? static_cast<int>((count_ / 10) % big_core_size)
                          : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      RequestPowerFullMode(thread_num);
      break;
    case LITE_POWER_HIGH:
      RequestPowerHighMode(thread_num);
      break;
    case LITE_POWER_LOW:
      RequestPowerLowMode(thread_num);
      break;
    case LITE_POWER_NO_BIND:
      RequestPowerNoBindMode(thread_num);
      break;
    case LITE_POWER_RAND_HIGH:
      RequestPowerRandHighMode(shift_num, thread_num);
      break;
    case LITE_POWER_RAND_LOW:
      RequestPowerRandLowMode(shift_num, thread_num);
      break;
    default:
      LOG(FATAL) << "Unsupported power mode: " << mode;
      break;
  }
  // Always keep at least one entry so the cache/arch lookups below are safe.
  if (active_ids_.size() == 0) {
    active_ids_.push_back(0);
  }
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
  if (mode_ != LITE_POWER_NO_BIND) {
    if (check_cpu_online(active_ids_)) {
      bind_threads(active_ids_);
    } else {
      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
      mode_ = LITE_POWER_NO_BIND;
    }
  }
#else   // LITE_WITH_LINUX
  // only LITE_POWER_NO_BIND is supported in other OS
  RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
#endif  // LITE_WITH_LINUX
  //! alloc memory for sgemm in this context
  workspace_.Resize(
      {static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
  arch_ = archs_[active_ids_[0]];
}
// Override the detected L1/L2/L3 cache sizes (same value for every core)
// and resize the shared workspace accordingly.
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
  // Fix: SetCacheInfo(cache_id, argc, ...) takes the count of variadic size
  // arguments as `argc`; the sizes were previously passed in the argc slot,
  // which made va_arg read arguments that were never supplied (UB).
  SetCacheInfo(0, 1, l1size);
  SetCacheInfo(1, 1, l2size);
  SetCacheInfo(2, 1, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Grow the shared workspace to hold `dims` plus one L2-cache worth of
// floats for the active core. Returns false (no resize) only when the
// requested element count equals the current workspace size exactly.
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize({static_cast<int64_t>(
      requested + L2_cache_[active_ids_[0]] / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
......
......@@ -14,24 +14,12 @@
#pragma once
#include <cstdarg>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
#include "paddle/fluid/lite/utils/cp_logging.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
namespace paddle {
namespace lite {
......@@ -60,64 +48,73 @@ typedef enum {
class DeviceInfo {
public:
int idx_;
int max_freq_;
int min_freq_;
int generate_arch_;
int compute_core_num_;
int max_memory_;
int sharemem_size_;
std::string device_name_;
std::string compute_ability_;
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
static DeviceInfo& Global() {
static auto* x = new DeviceInfo;
return *x;
}
static void Init() {
auto& info = Global();
InitInternal(&info);
static int Init() {
static int ret = Global().Setup();
return ret;
}
private:
DeviceInfo() = default;
static void InitInternal(DeviceInfo* dev);
};
int Setup();
size_t arm_get_meminfo();
void SetRunMode(PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
int arm_get_cpucount();
PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
void arm_get_cpu_arch(std::vector<ARMArch>* archs);
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev);
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
}
bool ExtendWorkspace(DDimLite dims);
std::string arm_get_cpu_name();
private:
int core_num_;
std::vector<int> max_freqs_;
std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_;
int get_max_freq_khz(int cpuid);
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids);
int check_online(const std::vector<int>& core_ids);
int set_sched_affinity(const std::vector<int>& cpuids);
ARMArch arch_;
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
void SetCacheInfo(int cache_id, int argc, ...);
void SetArchInfo(int argc, ...);
bool SetCPUInfoByName();
void SetCPUInfoByProb();
void RequestPowerFullMode(const int thread_num);
void RequestPowerHighMode(const int thread_num);
void RequestPowerLowMode(const int thread_num);
void RequestPowerNoBindMode(const int thread_num);
void RequestPowerRandHighMode(const int shift_num, const int thread_num);
void RequestPowerRandLowMode(const int shift_num, const int thread_num);
#endif // LITE_WITH_LINUX
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
......@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
auto o_dims = param.output->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
// TODO(xxx): make api and expose it
ctx.SetRunMode(LITE_POWER_HIGH, 4);
int win = x_dims[3]; // nchw
int hin = x_dims[2];
......
......@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
......
......@@ -24,7 +24,6 @@ namespace arm {
void MulCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void MulCompute::Run() {
......
......@@ -26,7 +26,6 @@ namespace arm {
void PoolCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void PoolCompute::Run() {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册