Make all ARMContext instances share the same DeviceInfo, and export SetRunMode to set the thread number

test=develop

Make all ARMContext instances share the same DeviceInfo, and export SetRunMode to set the thread number
test=develop
c8d89a2b · hong19860320 · ce6c24e6 · c8d89a2b · c8d89a2b · c8d89a2b
9 changed files
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -28,9 +28,10 @@ double time_diff(Time t1, Time t2) {
  return counter.count() / 1000.0;
 }

-void Run(const char* model_dir, int repeat) {
+void Run(const char* model_dir, int repeat, int thread_num) {
 #ifdef LITE_WITH_ARM
  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
 #endif
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
@@ -66,8 +67,12 @@ void Run(const char* model_dir, int repeat) {
 }  // namespace paddle

 int main(int argc, char** argv) {
-  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
-  paddle::lite::Run(argv[1], std::stoi(argv[2]));
+  // Exactly three arguments are expected after the program name.
+  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
+  const char* model_dir = argv[1];
+  const int repeat = std::stoi(argv[2]);
+  const int thread_num = std::stoi(argv[3]);
+  paddle::lite::Run(model_dir, repeat, thread_num);

  return 0;
 }

--- a/paddle/fluid/lite/core/context.cc
+++ b/paddle/fluid/lite/core/context.cc
@@ -13,322 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/context.h"
-#include "paddle/fluid/lite/core/cpu_info.h"
-
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif

 namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int cpu_count = arm_get_cpucount();
-  dev.L1_cache_.resize(cpu_count);
-  dev.L2_cache_.resize(cpu_count);
-  dev.L3_cache_.resize(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    dev.L1_cache_[i] = l1size;
-    dev.L2_cache_[i] = l2size;
-    dev.L3_cache_[i] = l3size;
-  }
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-
-Context<TargetType::kARM>::Context() {
-  active_ids_ = {0};
-  mode_ = LITE_POWER_HIGH;
-  DeviceInfo& dev = DeviceInfo::Global();
-  workspace_.Resize(
-      {static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
-#ifdef TARGET_IOS
-  arch_ = APPLE;  // use 6x8
-#else
-  if (dev.big_core_ids_.size() > 0) {
-    arch_ = dev.archs_[dev.big_core_ids_[0]];
-  }
-#endif
-}
-
-PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
-
-int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
-
-Context<TargetType::kARM>::Context(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-}
-
-ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-  return *this;
-}
-
-void Context<TargetType::kARM>::BindDev() {
-#ifdef ARM_WITH_OMP
-  int num_threads = active_ids_.size();
-  omp_set_num_threads(num_threads);
-#ifdef LITE_WITH_LINUX
-  std::vector<int> ssarets;
-  for (int j = 0; j < num_threads; ++j) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < num_threads; i++) {
-    ssarets[i] = set_sched_affinity(active_ids_);
-  }
-  for (int i = 0; i < num_threads; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
-      return;
-    }
-  }
-#endif  // LITE_WITH_LINUX
-#else   // ARM_WITH_OMP
-#ifdef LITE_WITH_LINUX
-  std::vector<int> cpuid1;
-  cpuid1.push_back(active_ids_[0]);
-  int ssaret = set_sched_affinity(cpuid1);
-  if (ssaret != 0) {
-    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
-    return;
-  }
-#endif  // LITE_WITH_LINUX
-#endif  // ARM_WITH_OMP
-}
-
-void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int big_core_size = dev.big_core_ids_.size();
-  int small_core_size = dev.little_core_ids_.size();
-  if (threads > big_core_size + small_core_size) {
-    threads = big_core_size + small_core_size;
-  }
-#ifdef ARM_WITH_OMP
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case LITE_POWER_FULL:
-      mode_ = mode;
-      active_ids_.clear();
-      for (int i = 0; i < threads; ++i) {
-        if (i < big_core_size) {
-          active_ids_.push_back(dev.big_core_ids_[i]);
-        } else {
-          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_HIGH;
-        if (threads > big_core_size) {
-          LOG(ERROR) << "threads: " << threads
-                     << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_NO_BIND:
-      mode_ = LITE_POWER_NO_BIND;
-      active_ids_.clear();
-      if (threads > dev.core_ids_.size()) {
-        active_ids_.resize(dev.core_ids_.size());
-      } else {
-        active_ids_.resize(threads);
-      }
-      break;
-    case LITE_POWER_RAND_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_RAND_HIGH;
-        if (threads > big_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                dev.big_core_ids_[(i + shift_num) % big_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(WARNING)
-            << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_RAND_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_RAND_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                dev.little_core_ids_[(i + shift_num) % small_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-  }
-  //! fix multi-threads LITE_POWER_HIGH mode
-  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
-    int threads = active_ids_.size();
-    omp_set_num_threads(threads);
-  } else {
-    if (check_online(active_ids_)) {
-      BindDev();
-    } else {
-      LOG(ERROR) << "core id " << active_ids_[0]
-                 << " is offline, switch to NO BIND MODE";
-      int threads = active_ids_.size();
-      omp_set_num_threads(threads);
-    }
-  }
-#else
-  if (big_core_size > 0) {
-    active_ids_ = {dev.big_core_ids_[0]};
-  } else {
-    active_ids_ = {0};
-  }
-#endif
-  //! alloc memory for sgemm in this context
-  int temp_mem_size =
-      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
-  workspace_.Resize({temp_mem_size});
-  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
-}
-
-ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
-
-void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
-
-int Context<TargetType::kARM>::l1_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L1_cache_[active_ids_[0]];
-}
-
-int Context<TargetType::kARM>::l2_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L2_cache_[active_ids_[0]];
-}
-
-int Context<TargetType::kARM>::l3_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L3_cache_[active_ids_[0]];
-}
-
-bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
-  auto count = dims.product();
-  auto old = workspace_.dims();
-  if (count == old.product()) {
-    return false;
-  }
-
-  workspace_.Resize(
-      {static_cast<int64_t>(count + l2_cache_size() / sizeof(float))});
-  return true;
-}
-#endif  // LITE_WITH_ARM
-
-}  // namespace lite
+namespace lite {}  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/context.h
+++ b/paddle/fluid/lite/core/context.h
@@ -61,47 +61,52 @@ class Context<TargetType::kHost> {
 template <>
 class Context<TargetType::kARM> {
 public:
-  Context();
-  Context(PowerMode mode, int threads);
-  explicit Context(const ARMContext& ctx);
+  // ARMContext keeps no state of its own: every accessor below forwards to
+  // DeviceInfo::Global(), so all ARMContext instances share one run mode,
+  // thread count, set of cache sizes and workspace.
+  Context() {}
+  // Copying has nothing to transfer. Defined inline because the out-of-line
+  // definition in context.cc is deleted by this change.
+  explicit Context(const ARMContext& ctx) {}

-  ARMContext& operator=(const ARMContext& ctx);
+  // Nothing to copy either; *this must still be returned, since flowing off
+  // the end of a non-void function is undefined behavior.
+  ARMContext& operator=(const ARMContext& ctx) { return *this; }

  // NOTE: InitOnce should only be used by ContextScheduler
-  void InitOnce() { DeviceInfo::Init(); }
+  // NOTE(review): no longer calls DeviceInfo::Init(); presumably DeviceInfo
+  // must be initialized elsewhere before the accessors are used -- confirm.
+  void InitOnce() {}

  void CopyShared(const ARMContext* ctx) {}

-  void SetRunMode(PowerMode mode, int threads);
-  void SetCache(int l1size, int l2size, int l3size);
-  void SetArch(ARMArch arch);
-  void BindDev();
+  // Setters: each call changes the shared DeviceInfo, not just this context.
+  void SetRunMode(PowerMode mode, int threads) {
+    return DeviceInfo::Global().SetRunMode(mode, threads);
+  }
+  void SetCache(int l1size, int l2size, int l3size) {
+    return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
+  }
+  void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
+  void BindDev() { return DeviceInfo::Global().BindDev(); }

-  PowerMode mode() const;
-  int threads() const;
-  ARMArch arch() const;
+  PowerMode mode() const { return DeviceInfo::Global().mode(); }
+  int threads() const { return DeviceInfo::Global().threads(); }
+  ARMArch arch() const { return DeviceInfo::Global().arch(); }

  template <typename T>
  T* workspace_data() {
-    return workspace_.mutable_data<T>();
+    return DeviceInfo::Global().workspace_data<T>();
  }

-  int l1_cache_size() const;
-  int l2_cache_size() const;
-  int l3_cache_size() const;
-  bool ExtendWorkspace(DDimLite dims);
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
+  bool ExtendWorkspace(DDimLite dims) {
+    return DeviceInfo::Global().ExtendWorkspace(dims);
+  }

  std::string name() const { return "ARMContext"; }
-
- private:
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  ARMArch arch_;
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
 };
 #endif


--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
@@ -12,8 +12,24 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

+#ifdef LITE_WITH_LINUX
+#include <sys/syscall.h>
+#include <unistd.h>
+#endif
+#if __APPLE__
+#include "TargetConditionals.h"
+#if TARGET_OS_IPHONE
+#include <mach/machine.h>
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#endif  // TARGET_OS_IPHONE
+#endif  // __APPLE__
+
+#ifdef ARM_WITH_OMP
+#include <omp.h>
+#endif
+
 #include "paddle/fluid/lite/core/cpu_info.h"
-#include <cstdarg>

 namespace paddle {
 namespace lite {
@@ -73,6 +89,252 @@ void DeviceInfo::InitInternal(DeviceInfo* dev) {
 #elif defined(TARGET_IOS)
  arm_get_cpu_arch(&dev->archs_);
 #endif
+  dev->active_ids_ = {0};
+  dev->mode_ = LITE_POWER_HIGH;
+  dev->workspace_.Resize({static_cast<int64_t>(
+      dev->L2_cache_[dev->active_ids_[0]] / sizeof(float))});
+#ifdef TARGET_IOS
+  dev->arch_ = APPLE;  // use 6x8
+#else
+  if (dev->big_core_ids_.size() > 0) {
+    dev->arch_ = dev->archs_[dev->big_core_ids_[0]];
+  }
+#endif
+}
+
+// Overwrites the L1/L2/L3 cache size recorded for every core (as counted by
+// arm_get_cpucount()) with the given values, then resizes the workspace to
+// the 1-D shape {2 * (l1size + l2size)}.
+void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
+  const int num_cores = arm_get_cpucount();
+  // assign() both resizes the table and fills every entry in one step.
+  L1_cache_.assign(num_cores, l1size);
+  L2_cache_.assign(num_cores, l2size);
+  L3_cache_.assign(num_cores, l3size);
+
+  workspace_.Resize({2 * (l1size + l2size)});
+}
+
+// Applies the current active_ids_ selection: sets the OpenMP thread count
+// (ARM_WITH_OMP) and calls set_sched_affinity() (LITE_WITH_LINUX only).
+void DeviceInfo::BindDev() {
+#ifdef ARM_WITH_OMP
+  int num_threads = active_ids_.size();
+  omp_set_num_threads(num_threads);
+#ifdef LITE_WITH_LINUX
+  // One result slot per loop iteration, so iterations never share a slot.
+  std::vector<int> ssarets(num_threads, 0);
+#pragma omp parallel for
+  for (int i = 0; i < num_threads; i++) {
+    ssarets[i] = set_sched_affinity(active_ids_);
+  }
+  for (int i = 0; i < num_threads; i++) {
+    if (ssarets[i] != 0) {
+      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
+      return;
+    }
+  }
+#endif  // LITE_WITH_LINUX
+#else   // ARM_WITH_OMP
+#ifdef LITE_WITH_LINUX
+  // Without OpenMP only the first active core is bound.
+  std::vector<int> cpuid1;
+  cpuid1.push_back(active_ids_[0]);
+  int ssaret = set_sched_affinity(cpuid1);
+  if (ssaret != 0) {
+    LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[0];
+  }
+#endif  // LITE_WITH_LINUX
+#endif  // ARM_WITH_OMP
+}
+
+// Selects the cores to run on for `mode` and a requested thread count, and
+// updates mode_, active_ids_, workspace_ and arch_ to match. `threads` is
+// clamped to the number of known cores; if the cluster `mode` asks for is
+// missing, the other cluster is used and mode_ records that fallback.
+// Without ARM_WITH_OMP a single core is always selected.
+void DeviceInfo::SetRunMode(PowerMode mode, int threads) {
+  LOG(INFO) << "ARM SetRunMode called";
+  int big_core_size = big_core_ids_.size();
+  int small_core_size = little_core_ids_.size();
+  if (threads > big_core_size + small_core_size) {
+    threads = big_core_size + small_core_size;
+  }
+#ifdef ARM_WITH_OMP
+  count_++;
+  // Rotation offset for the RAND modes; it advances every 10 calls. Guarded
+  // because `% 0` is undefined when the device reports no big cores.
+  int shift_num = big_core_size > 0 ? (count_ / 10) % big_core_size : 0;
+  switch (mode) {
+    case LITE_POWER_FULL:
+      mode_ = mode;
+      active_ids_.clear();
+      for (int i = 0; i < threads; ++i) {
+        if (i < big_core_size) {
+          active_ids_.push_back(big_core_ids_[i]);
+        } else {
+          active_ids_.push_back(little_core_ids_[i - big_core_size]);
+        }
+      }
+      break;
+    case LITE_POWER_HIGH:
+      active_ids_.clear();
+      if (big_core_size > 0) {
+        mode_ = LITE_POWER_HIGH;
+        if (threads > big_core_size) {
+          LOG(ERROR) << "threads: " << threads
+                     << ", exceed the big cores size: " << big_core_size;
+          active_ids_ = big_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(big_core_ids_[i]);
+          }
+        }
+      } else {
+        mode_ = LITE_POWER_LOW;
+        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
+        if (threads > small_core_size) {
+          active_ids_ = little_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(little_core_ids_[i]);
+          }
+        }
+      }
+      break;
+    case LITE_POWER_LOW:
+      active_ids_.clear();
+      if (small_core_size > 0) {
+        mode_ = LITE_POWER_LOW;
+        if (threads > small_core_size) {
+          LOG(WARNING) << "threads: " << threads
+                       << ", exceed the little cores size: " << small_core_size;
+          active_ids_ = little_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(little_core_ids_[i]);
+          }
+        }
+      } else {
+        mode_ = LITE_POWER_HIGH;
+        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
+        if (threads > big_core_size) {
+          active_ids_ = big_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(big_core_ids_[i]);
+          }
+        }
+      }
+      break;
+    case LITE_POWER_NO_BIND:
+      // No binding: only the number of ids matters; resize() leaves them 0.
+      mode_ = LITE_POWER_NO_BIND;
+      active_ids_.clear();
+      if (threads > core_ids_.size()) {
+        active_ids_.resize(core_ids_.size());
+      } else {
+        active_ids_.resize(threads);
+      }
+      break;
+    case LITE_POWER_RAND_HIGH:
+      active_ids_.clear();
+      if (big_core_size > 0) {
+        mode_ = LITE_POWER_RAND_HIGH;
+        if (threads > big_core_size) {
+          LOG(WARNING) << "threads: " << threads
+                       << ", exceed the big cores size: " << big_core_size;
+          active_ids_ = big_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(
+                big_core_ids_[(i + shift_num) % big_core_size]);
+          }
+        }
+      } else {
+        mode_ = LITE_POWER_LOW;
+        LOG(WARNING)
+            << "HIGH POWER MODE is not support, switch to little cores.";
+        if (threads > small_core_size) {
+          active_ids_ = little_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(little_core_ids_[i]);
+          }
+        }
+      }
+      break;
+    case LITE_POWER_RAND_LOW:
+      active_ids_.clear();
+      if (small_core_size > 0) {
+        mode_ = LITE_POWER_RAND_LOW;
+        if (threads > small_core_size) {
+          LOG(WARNING) << "threads: " << threads
+                       << ", exceed the little cores size: " << small_core_size;
+          active_ids_ = little_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(
+                little_core_ids_[(i + shift_num) % small_core_size]);
+          }
+        }
+      } else {
+        mode_ = LITE_POWER_HIGH;
+        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
+        if (threads > big_core_size) {
+          active_ids_ = big_core_ids_;
+        } else {
+          for (int i = 0; i < threads; ++i) {
+            active_ids_.push_back(big_core_ids_[i]);
+          }
+        }
+      }
+      break;
+  }
+  // Every mode must leave at least one active id: the first one indexes the
+  // cache and arch tables below, and the list size becomes the OpenMP thread
+  // count (e.g. NO_BIND with threads == 0 resizes the list to empty).
+  if (active_ids_.empty()) {
+    active_ids_.push_back(0);
+  }
+  //! fix multi-threads LITE_POWER_HIGH mode
+  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
+    int threads = active_ids_.size();
+    omp_set_num_threads(threads);
+  } else {
+    if (check_online(active_ids_)) {
+      BindDev();
+    } else {
+      LOG(WARNING) << "core id " << active_ids_[0]
+                   << " is offline, switch to NO BIND MODE";
+      int threads = active_ids_.size();
+      omp_set_num_threads(threads);
+    }
+  }
+#else
+  // No OpenMP: always run on one core, the first big core when there is one.
+  if (big_core_size > 0) {
+    active_ids_ = {big_core_ids_[0]};
+  } else {
+    active_ids_ = {0};
+  }
+#endif
+  //! alloc memory for sgemm in this context
+  int temp_mem_size = L2_cache_[active_ids_[0]] / sizeof(float);
+  workspace_.Resize({temp_mem_size});
+  arch_ = archs_[active_ids_[0]];
+}
+
+// Resizes the workspace to a 1-D shape of dims.product() elements plus
+// L2_cache_[active_ids_[0]] / sizeof(float) extra ones. Returns false and
+// leaves it untouched when dims.product() equals the current element count.
+bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
+  const auto requested = dims.product();
+  auto current = workspace_.dims();
+  if (requested == current.product()) return false;
+  const auto l2_floats = L2_cache_[active_ids_[0]] / sizeof(float);
+  workspace_.Resize({static_cast<int64_t>(requested + l2_floats)});
+  return true;
 }

 // cache_id : 0 -> L1, 1 -> L2, 2 -> L3

--- a/paddle/fluid/lite/core/cpu_info.h
+++ b/paddle/fluid/lite/core/cpu_info.h
@@ -16,22 +16,9 @@

 #include <string>
 #include <vector>
+#include "paddle/fluid/lite/core/lite_tensor.h"
 #include "paddle/fluid/lite/utils/cp_logging.h"

-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-
 namespace paddle {
 namespace lite {

@@ -80,6 +67,15 @@ class DeviceInfo {
  std::vector<int> cluster_ids_;
  std::vector<ARMArch> archs_;

+  ARMArch arch_;
+  // LITE_POWER_HIGH stands for using big cores,
+  // LITE_POWER_LOW stands for using small core,
+  // LITE_POWER_FULL stands for using all cores
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
+
  static DeviceInfo& Global() {
    static auto* x = new DeviceInfo;
    return *x;
@@ -90,6 +86,31 @@ class DeviceInfo {
    InitInternal(&info);
  }

+  // Run-mode configuration. DeviceInfo is shared through Global(), so these
+  // change the settings seen by every user of that instance.
+  void SetRunMode(PowerMode mode, int threads);
+  void SetCache(int l1size, int l2size, int l3size);
+  void SetArch(ARMArch arch) { arch_ = arch; }
+  void BindDev();
+
+  PowerMode mode() const { return mode_; }
+  // Number of active core ids (explicit size_t -> int narrowing).
+  int threads() const { return static_cast<int>(active_ids_.size()); }
+  ARMArch arch() const { return arch_; }
+
+  // Typed access to the workspace tensor via mutable_data<T>().
+  template <typename T>
+  T* workspace_data() {
+    return workspace_.mutable_data<T>();
+  }
+
+  // Cache sizes recorded for the first active core. These index
+  // active_ids_[0], so active_ids_ must not be empty when they are called.
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
+  bool ExtendWorkspace(DDimLite dims);
+
 private:
  DeviceInfo() = default;
  static void InitInternal(DeviceInfo* dev);

--- a/paddle/fluid/lite/kernels/arm/conv_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc
@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
  auto o_dims = param.output->dims();

  auto& ctx = this->ctx_->template As<ARMContext>();
-  // TODO(xxx): make api and expose it
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);

  int win = x_dims[3];  // nchw
  int hin = x_dims[2];

--- a/paddle/fluid/lite/kernels/arm/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc
@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
  auto w_dims = param.w->dims();

  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);

  CHECK_GE(x_dims.size(), 2UL);
  CHECK_EQ(w_dims.size(), 2UL);

--- a/paddle/fluid/lite/kernels/arm/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc
@@ -24,7 +24,6 @@ namespace arm {

 void MulCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }

 void MulCompute::Run() {

--- a/paddle/fluid/lite/kernels/arm/pool_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc
@@ -26,7 +26,6 @@ namespace arm {

 void PoolCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }

 void PoolCompute::Run() {