提交 58bf3c48 编写于 作者: N nhzlx

Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

......@@ -2,6 +2,20 @@ before_script:
- env
- export CI_USER_DIR=$(pwd)
# prepare ccache
- apt install ccache
# for proxy
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
image: $SERVER_LITE_DOCKER_IMAGE
stages:
......@@ -14,19 +28,13 @@ check:prebuilt:
- lite
stage: ci
script:
# prepare for pre-commit
- rm -rf ~/.pip
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
- pip install pre-commit
- pre-commit install
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
- ./paddle/fluid/lite/tools/build.sh check_style
cache:
key: check_style
paths:
......@@ -42,17 +50,11 @@ build:server:
paths:
- build/third_party
- ~/.ccache
- $CI_PROJECT_DIR/_build_server_ccache
script:
- apt install ccache
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
# customize ccache path for specifying runner cache
- export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache
# run build and test
- mkdir -p build
- cd build
- ../paddle/fluid/lite/tools/build.sh cmake_x86
......@@ -66,7 +68,27 @@ build:server:
dependencies:
- check:prebuilt
build:mobile:
build:mobile_android:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache
- ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android
dependencies:
- build:server
build:mobile_armlinux:
tags:
- lite
stage: build_mobile
......@@ -77,17 +99,43 @@ build:mobile:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache2
script:
- apt install ccache
- export http_proxy=$CI_PROXY
- export https_proxy=$CI_PROXY
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2
- ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux
dependencies:
- build:server
# merge the latest code
- git config --global user.email "you@example.com"
- git config --global user.name "Your Name"
- git fetch origin incubate/lite
- git merge --no-ff origin/incubate/lite
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv2:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
- ./paddle/fluid/lite/tools/build.sh build_test_arm
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model1
......@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
return counter.count() / 1000.0;
}
void Run(const char* model_dir, int repeat) {
void Run(const char* model_dir, int repeat, int thread_num) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
......@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
} // namespace paddle
int main(int argc, char** argv) {
CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
paddle::lite::Run(argv[1], std::stoi(argv[2]));
CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
return 0;
}
......
......@@ -13,322 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/cpu_info.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
// Overrides the per-core L1/L2/L3 cache sizes recorded in the global
// DeviceInfo, applying the same three sizes to every core, and resizes
// this context's scratch workspace accordingly.
void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
  DeviceInfo& device = DeviceInfo::Global();
  const int core_num = arm_get_cpucount();
  // assign() == resize-to-core_num + fill with the given size.
  device.L1_cache_.assign(core_num, l1size);
  device.L2_cache_.assign(core_num, l2size);
  device.L3_cache_.assign(core_num, l3size);
  // Scratch buffer holds twice the combined L1+L2 capacity.
  workspace_.Resize({2 * (l1size + l2size)});
}
// Default-constructs an ARM context: runs on core 0 in LITE_POWER_HIGH
// mode, with a scratch workspace sized from core 0's L2 cache (in floats).
Context<TargetType::kARM>::Context() {
active_ids_ = {0};
mode_ = LITE_POWER_HIGH;
DeviceInfo& dev = DeviceInfo::Global();
// Workspace element count = L2 cache bytes of the bound core / sizeof(float).
workspace_.Resize(
{static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
#ifdef TARGET_IOS
arch_ = APPLE; // use 6x8
#else
// Take the architecture of the first big core when the device reports one;
// otherwise arch_ keeps its previous/default value.
if (dev.big_core_ids_.size() > 0) {
arch_ = dev.archs_[dev.big_core_ids_[0]];
}
#endif
}
// Current power mode of this context.
PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
// Number of cores this context is currently bound to.
int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
// Copy constructor: duplicates mode, core binding, workspace tensor,
// architecture, and the SetRunMode rotation counter.
Context<TargetType::kARM>::Context(const ARMContext& ctx) {
mode_ = ctx.mode_;
active_ids_ = ctx.active_ids_;
workspace_ = ctx.workspace_;
arch_ = ctx.arch_;
count_ = ctx.count_;
}
// Copy assignment: same field-by-field copy as the copy constructor.
ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
mode_ = ctx.mode_;
active_ids_ = ctx.active_ids_;
workspace_ = ctx.workspace_;
arch_ = ctx.arch_;
count_ = ctx.count_;
return *this;
}
// Pins execution to the cores listed in active_ids_.
// OpenMP build: sets the OMP thread count and applies CPU affinity from
// every worker thread (Linux only). Non-OMP build: binds the single
// thread to the first active core. On other platforms this is a no-op
// beyond setting the OMP thread count.
void Context<TargetType::kARM>::BindDev() {
#ifdef ARM_WITH_OMP
int num_threads = active_ids_.size();
omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX
// One result slot per worker thread.
std::vector<int> ssarets;
for (int j = 0; j < num_threads; ++j) {
ssarets.push_back(0);
}
#pragma omp parallel for
for (int i = 0; i < num_threads; i++) {
// NOTE(review): each thread is handed the full active_ids_ set rather
// than a single core id -- presumably so the scheduler may migrate
// threads within the selected cluster; confirm against set_sched_affinity.
ssarets[i] = set_sched_affinity(active_ids_);
}
// Report the first failed binding, if any, and bail out.
for (int i = 0; i < num_threads; i++) {
if (ssarets[i] != 0) {
LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
return;
}
}
#endif // LITE_WITH_LINUX
#else // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
// Single-threaded build: bind this thread to the first active core only.
std::vector<int> cpuid1;
cpuid1.push_back(active_ids_[0]);
int ssaret = set_sched_affinity(cpuid1);
if (ssaret != 0) {
printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
return;
}
#endif // LITE_WITH_LINUX
#endif // ARM_WITH_OMP
}
// Selects the power/affinity policy for subsequent ARM kernels.
// `mode` chooses which cluster(s) to run on (big cores, little cores,
// all cores, rotating RAND variants, or unbound); `threads` is clamped
// to the total core count. When the preferred cluster is empty the mode
// silently degrades to the opposite cluster (HIGH<->LOW). After picking
// active_ids_ this rebinds OMP threads (or falls back to no-bind), then
// resizes the scratch workspace from the first active core's L2 cache.
void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
DeviceInfo& dev = DeviceInfo::Global();
int big_core_size = dev.big_core_ids_.size();
int small_core_size = dev.little_core_ids_.size();
// Clamp the requested thread count to the number of available cores.
if (threads > big_core_size + small_core_size) {
threads = big_core_size + small_core_size;
}
#ifdef ARM_WITH_OMP
count_++;
// Rotation offset for the RAND modes: shifts the starting core every
// 10th call.
// NOTE(review): modulo by big_core_size is undefined when the device
// reports zero big cores -- confirm callers guarantee big_core_size > 0
// or guard this.
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
// Use big cores first, then little cores, up to `threads`.
mode_ = mode;
active_ids_.clear();
for (int i = 0; i < threads; ++i) {
if (i < big_core_size) {
active_ids_.push_back(dev.big_core_ids_[i]);
} else {
active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_HIGH:
// Prefer big cores; degrade to LITE_POWER_LOW on little-core-only
// devices.
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (threads > big_core_size) {
LOG(ERROR) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
if (threads > small_core_size) {
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.little_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_LOW:
// Prefer little cores; degrade to big cores when there are none.
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.big_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_NO_BIND:
// No affinity: only the *count* of active ids matters for OMP below.
// NOTE(review): resize() value-initializes the new ids to 0 rather
// than real core ids, so the workspace/arch lookups at the end of
// this function always index core 0 in this mode -- confirm intended.
mode_ = LITE_POWER_NO_BIND;
active_ids_.clear();
if (threads > dev.core_ids_.size()) {
active_ids_.resize(dev.core_ids_.size());
} else {
active_ids_.resize(threads);
}
break;
case LITE_POWER_RAND_HIGH:
// Like LITE_POWER_HIGH but rotates the starting big core by shift_num.
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
if (threads > big_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
dev.big_core_ids_[(i + shift_num) % big_core_size]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(WARNING)
<< "HIGH POWER MODE is not support, switch to little cores";
if (threads > small_core_size) {
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.little_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_RAND_LOW:
// Like LITE_POWER_LOW but rotates the starting little core by shift_num.
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = dev.little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
dev.little_core_ids_[(i + shift_num) % small_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = dev.big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(dev.big_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
}
//! fix multi-threads LITE_POWER_HIGH mode
// Multi-threaded or unbound: just set the OMP thread count, no pinning.
if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
int threads = active_ids_.size();
omp_set_num_threads(threads);
} else {
// Single bound thread: pin it only when the chosen core is online.
if (check_online(active_ids_)) {
BindDev();
} else {
LOG(ERROR) << "core id " << active_ids_[0]
<< " is offline, switch to NO BIND MODE";
int threads = active_ids_.size();
omp_set_num_threads(threads);
}
}
#else
// Non-OMP build: run single-threaded on the first big core (or core 0).
if (big_core_size > 0) {
active_ids_ = {dev.big_core_ids_[0]};
} else {
active_ids_ = {0};
}
#endif
//! alloc memory for sgemm in this context
int temp_mem_size =
DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
workspace_.Resize({temp_mem_size});
arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
}
// Architecture of the cores this context runs on.
ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
// Overrides the recorded core architecture.
void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
// L1 data-cache size in bytes of the first active core.
int Context<TargetType::kARM>::l1_cache_size() const {
DeviceInfo& dev = DeviceInfo::Global();
return dev.L1_cache_[active_ids_[0]];
}
// L2 cache size in bytes of the first active core.
int Context<TargetType::kARM>::l2_cache_size() const {
DeviceInfo& dev = DeviceInfo::Global();
return dev.L2_cache_[active_ids_[0]];
}
// L3 cache size in bytes of the first active core.
int Context<TargetType::kARM>::l3_cache_size() const {
DeviceInfo& dev = DeviceInfo::Global();
return dev.L3_cache_[active_ids_[0]];
}
// Resizes the scratch workspace to hold `dims` worth of elements plus
// L2-cache-sized headroom. Returns false when the requested element
// count already equals the current workspace size (no resize), true
// after a resize.
// NOTE(review): only exact equality skips the resize, so a *smaller*
// request still reallocates -- confirm this is intended.
bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize(
      {static_cast<int64_t>(requested + l2_cache_size() / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
} // namespace lite
namespace lite {} // namespace lite
} // namespace paddle
......@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
template <>
class Context<TargetType::kARM> {
public:
Context();
Context(PowerMode mode, int threads);
Context() {}
explicit Context(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx) {}
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch);
void BindDev();
void SetRunMode(PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads);
}
void SetCache(int l1size, int l2size, int l3size) {
return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const;
int threads() const;
ARMArch arch() const;
PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
return DeviceInfo::Global().workspace_data<T>();
}
int l1_cache_size() const;
int l2_cache_size() const;
int l3_cache_size() const;
bool ExtendWorkspace(DDimLite dims);
bool ExtendWorkspace(DDimLite dims) {
return DeviceInfo::Global().ExtendWorkspace(dims);
}
std::string name() const { return "ARMContext"; }
private:
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
ARMArch arch_;
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
};
#endif
......
......@@ -12,312 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
#include <algorithm>
#include <limits>
#include "paddle/fluid/lite/core/cpu_info.h"
#include <cstdarg>
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
void DeviceInfo::InitInternal(DeviceInfo* dev) {
set_default_cache(dev);
dev->compute_core_num_ = arm_get_cpucount();
dev->max_memory_ = arm_get_meminfo();
// get max freq
#ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_);
for (int i = 0; i < dev->compute_core_num_; ++i) {
max_freq[i] = get_max_freq_khz(i) / 1000;
}
std::string cpu_name = arm_get_cpu_name();
if (get_cpu_info_from_name(dev, cpu_name) != true) {
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
max_freq, &dev->cluster_ids_);
dev->big_core_ids_.clear();
dev->little_core_ids_.clear();
for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
if (dev->cluster_ids_[i] == 0) {
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
}
}
arm_get_cpu_arch(&dev->archs_);
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_;
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i]
<< ", frequence: " << max_freq[i]
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
if (dev->max_freq_ < max_freq[j]) {
dev->max_freq_ = max_freq[j];
}
}
#elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_);
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#else
const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#endif
}
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &cpu_info->L1_cache_;
break;
case 1:
cache = &cpu_info->L2_cache_;
break;
case 2:
cache = &cpu_info->L3_cache_;
break;
default:
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
int core_num = cpu_info->compute_core_num_;
cache->resize(core_num);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size;
}
cpu_num++;
fclose(fp);
}
va_end(arg_ptr);
}
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
int core_num = cpu_info->compute_core_num_;
cpu_info->archs_.resize(core_num);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
cpu_info->archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch;
}
if (cpu_num < 1) {
cpu_num = 1;
}
va_end(arg_ptr);
}
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
/* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024);
set_cache_info(cpu_info, 2, 1, 2048 * 1024);
return true;
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
return true;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
set_cache_info(cpu_info, 0, 2, 64 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024,
/*real cache size is 2M, while that will get bad performace
on conv3x3s1 or gemm, set to 1M or 512K*/
1024 * 1024);
return true;
} else if (hardware_name.find("MSM8996") != std::string::npos) { // 820
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {2, 3};
cpu_info->little_core_ids_ = {0, 1};
cpu_info->cluster_ids_ = {1, 1, 0, 0};
set_arch_info(cpu_info, 1, kA72);
set_cache_info(cpu_info, 0, 1, 24 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("SDM660") != std::string::npos ||
hardware_name.find("SDM636") != std::string::npos) { // 660, 636
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA73);
set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MSM8953") != std::string::npos) { // 625
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8939") != std::string::npos) { // 615
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {4, 5, 6, 7};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
return true;
/* MediaTek */
} else if (hardware_name.find("MT6797") !=
std::string::npos) { // X20/X23/X25/X27
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6795") != std::string::npos ||
hardware_name.find("MT6762") != std::string::npos ||
hardware_name.find("MT6755T") != std::string::npos ||
hardware_name.find("MT6755S") != std::string::npos ||
hardware_name.find("MT6753") != std::string::npos ||
hardware_name.find("MT6752") != std::string::npos ||
hardware_name.find("MT6750") != std::string::npos) {
// X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6758") != std::string::npos ||
hardware_name.find("MT6757") != std::string::npos ||
hardware_name.find("MT6763") != std::string::npos ||
hardware_name.find("MT6755M") != std::string::npos ||
hardware_name.find("MT6755") !=
std::string::npos) { // P30, P20/P25, P23, P10
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6765") != std::string::npos ||
hardware_name.find("MT6739") != std::string::npos ||
hardware_name.find("MT6738") != std::string::npos ||
hardware_name.find("MT6737") !=
std::string::npos) { // A22, MT6739, MT6738, MT6767
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 0, 0, 0};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
}
return false;
return cpu_num;
#else
return 1;
#endif
}
size_t arm_get_meminfo() {
size_t get_mem_size() {
#ifdef LITE_WITH_LINUX
// get cpu count from /proc/cpuinfo
FILE* fp = fopen("/proc/meminfo", "rb");
if (!fp) {
return 1;
}
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
......@@ -327,52 +96,18 @@ size_t arm_get_meminfo() {
}
sscanf(s, "MemTotal: %d kB", &memsize);
}
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
}
int arm_get_cpucount() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_count = 20;
int count = 0;
for (int i = 0; i < max_cpu_count; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
count++;
fclose(fp);
}
if (count < 1) {
count = 1;
}
return count;
#elif defined(TARGET_IOS)
int count = 0;
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
if (count < 1) {
count = 1;
}
return count;
#else
return 1;
#endif
return 0;
}
void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
#ifdef LITE_WITH_LINUX
void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
archs->clear();
#ifdef LITE_WITH_LINUX
//! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
......@@ -406,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
......@@ -415,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
int cpu_count = arm_get_cpucount();
if (archs->size() < cpu_count) {
for (int i = archs->size(); i < cpu_count; ++i) {
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
}
}
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(APPLE);
}
#else
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(kARMArch_UNKOWN);
}
#endif
}
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev) {
int cpu_count = arm_get_cpucount();
dev->L1_cache_.resize(cpu_count);
dev->L2_cache_.resize(cpu_count);
dev->L3_cache_.resize(cpu_count);
#ifdef TARGET_IOS
for (int i = 0; i < cpu_count; ++i) {
dev->L1_cache_[i] = 64 * 1024;
dev->L2_cache_[i] = 2048 * 1024;
dev->L3_cache_[i] = 0;
}
#else
for (int i = 0; i < cpu_count; ++i) {
dev->L1_cache_[i] = 32 * 1024;
dev->L2_cache_[i] = 512 * 1024;
dev->L3_cache_[i] = 0;
}
#endif
}
std::string arm_get_cpu_name() {
std::string get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
......@@ -477,122 +217,163 @@ std::string arm_get_cpu_name() {
return "";
}
int get_max_freq_khz(int cpuid) {
void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
*max_freq = 0;
*min_freq = 0;
// first try, for all possible cpu
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return -1;
return;
}
int max_freq_khz = -1;
fscanf(fp, "%d", &max_freq_khz);
fscanf(fp, "%d", max_freq);
fclose(fp);
return max_freq_khz;
// get min_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
}
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
}
}
int max_freq_khz = 0;
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break;
}
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
if (freq > *max_freq) {
*max_freq = freq;
}
if (freq < *min_freq) {
*min_freq = freq;
}
}
fclose(fp);
return max_freq_khz;
}
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids) {
if (cpu_count == 0) {
return 0;
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
}
cpuids->resize(cpu_count);
cluster_ids->resize(cpu_count);
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
cpu_ids->resize(cpu_num);
cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
}
// sort cpuid as big core first
// simple bubble sort
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
for (int i = 0; i < cpu_num; i++) {
for (int j = i + 1; j < cpu_num; j++) {
if (max_freqs[i] < max_freqs[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
int tmp = cpu_ids->at(i);
cpu_ids->at(i) = cpu_ids->at(j);
cpu_ids->at(j) = tmp;
}
}
}
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else {
cluster_ids->at(i) = 1;
}
}
return 0;
}
int check_online(const std::vector<int>& core_ids) {
if (core_ids.size() == 0) {
return 0;
// Reads the L1/L2/L3 data-cache sizes (in bytes) of one core from the sysfs
// cache-index entries. Any level that is missing or unreadable keeps the
// compiled-in default size.
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
                        int* l3_cache_size) {
  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
  const int max_cache_idx_num = 10;
  char path[256];
  for (int index = 0; index < max_cache_idx_num; ++index) {
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id,
             index);
    FILE* fp = fopen(path, "rb");
    if (!fp) {
      continue;
    }
    int level = -1;
    fscanf(fp, "%d", &level);
    fclose(fp);
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id,
             index);
    fp = fopen(path, "rb");
    if (!fp) {
      continue;
    }
    int size = -1;
    fscanf(fp, "%d", &size);
    fclose(fp);
    if (size < 0) {
      continue;
    }
    // sysfs reports the size in KB (e.g. "32K"); convert to bytes.
    if (level == 1) {
      *l1_cache_size = size * 1024;
    } else if (level == 2) {
      *l2_cache_size = size * 1024;
    } else if (level == 3) {
      *l3_cache_size = size * 1024;
    }
  }
}
bool check_cpu_online(const std::vector<int>& cpu_ids) {
if (cpu_ids.size() == 0) {
return false;
}
char path[256];
int online = 1;
for (int i = 0; i < core_ids.size(); ++i) {
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
core_ids[i]);
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
if (!fp) {
return 0;
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
}
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpuids) {
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
......@@ -608,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpuids.size(); i++) {
CPU_SET(cpuids[i], &mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
}
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
}
return 0;
}
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
int thread_num = cpu_ids.size();
omp_set_num_threads(thread_num);
std::vector<int> ssarets;
for (int i = 0; i < thread_num; ++i) {
ssarets.push_back(0);
}
#pragma omp parallel for
for (int i = 0; i < thread_num; i++) {
ssarets[i] = set_sched_affinity(cpu_ids);
}
for (int i = 0; i < thread_num; i++) {
if (ssarets[i] != 0) {
LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
return false;
}
}
#else // ARM_WITH_OMP
std::vector<int> first_cpu_id;
first_cpu_id.push_back(cpu_ids[0]);
int ssaret = set_sched_affinity(first_cpu_id);
if (ssaret != 0) {
LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
return false;
}
#endif // ARM_WITH_OMP
}
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &L1_cache_;
break;
case 1:
cache = &L2_cache_;
break;
case 2:
cache = &L3_cache_;
break;
default:
break;
}
cache->resize(core_num_);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[little_core_ids_[i]] = little_core_cache_size;
}
}
va_end(arg_ptr);
}
void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
archs_.resize(core_num_);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
archs_[big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
archs_[little_core_ids_[i]] = little_core_arch;
}
}
va_end(arg_ptr);
}
// Fills core topology, architecture and cache sizes from a hard-coded table
// keyed on the SoC name in dev_name_ (as read from /proc/cpuinfo).
// Returns true when the SoC was recognized, false when the caller should
// fall back to probing (SetCPUInfoByProb).
// Convention in this table: cluster id 0 = big cluster, 1 = little cluster.
bool DeviceInfo::SetCPUInfoByName() {
  /* Snapdragon */
  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA76, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {6, 7};
    little_core_ids_ = {0, 1, 2, 3, 4, 5};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8998") != std::string::npos) {  // 835
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024,
                 /* real cache size is 2M, but that gives bad performance
                    on conv3x3s1 or gemm, so report 1M or 512K instead */
                 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8996") != std::string::npos) {  // 820
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {2, 3};
    little_core_ids_ = {0, 1};
    cluster_ids_ = {1, 1, 0, 0};
    SetArchInfo(1, kA72);
    SetCacheInfo(0, 1, 24 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("SDM660") != std::string::npos ||
             dev_name_.find("SDM636") != std::string::npos) {  // 660, 636
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA73);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8976") != std::string::npos) {  // 652,653
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MSM8953") != std::string::npos) {  // 625
    // Homogeneous SoC: all cores are treated as the "big" cluster.
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8939") != std::string::npos) {  // 615
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {4, 5, 6, 7};
    cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
    return true;
    /* MediaTek */
  } else if (dev_name_.find("MT6797") !=
             std::string::npos) {  // X20/X23/X25/X27
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MT6799") != std::string::npos) {  // X30
    // NOTE: no cache info in this entry and below — those SoCs keep the
    // defaults set by Setup().
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6795") != std::string::npos ||
             dev_name_.find("MT6762") != std::string::npos ||
             dev_name_.find("MT6755T") != std::string::npos ||
             dev_name_.find("MT6755S") != std::string::npos ||
             dev_name_.find("MT6753") != std::string::npos ||
             dev_name_.find("MT6752") != std::string::npos ||
             dev_name_.find("MT6750") != std::string::npos) {
    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6758") != std::string::npos ||
             dev_name_.find("MT6757") != std::string::npos ||
             dev_name_.find("MT6763") != std::string::npos ||
             dev_name_.find("MT6755M") != std::string::npos ||
             dev_name_.find("MT6755") !=
                 std::string::npos) {  // P30, P20/P25, P23, P10
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6771") != std::string::npos) {  // P60
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6765") != std::string::npos ||
             dev_name_.find("MT6739") != std::string::npos ||
             dev_name_.find("MT6738") != std::string::npos ||
             dev_name_.find("MT6737") !=
                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  }
  // Unknown SoC: caller falls back to frequency-based probing.
  return false;
}
// Probes the big.LITTLE topology when the SoC name is unknown: cores are
// clustered by their max frequency, then per-core cache sizes are read
// from sysfs. Linux-only; a no-op elsewhere.
void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX
  // Cluster id 0 = big cluster, 1 = little cluster.
  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
  big_core_ids_.clear();
  little_core_ids_.clear();
  for (size_t idx = 0; idx < cluster_ids_.size(); ++idx) {
    if (cluster_ids_[idx] == 0) {
      big_core_ids_.push_back(core_ids_[idx]);
    } else {
      little_core_ids_.push_back(core_ids_[idx]);
    }
  }
  // Fill the per-core L1/L2/L3 cache tables from sysfs.
  for (int core = 0; core < core_num_; ++core) {
    get_cpu_cache_size(core, &L1_cache_[core], &L2_cache_[core],
                       &L3_cache_[core]);
  }
#endif  // LITE_WITH_LINUX
}
void DeviceInfo::RequestPowerFullMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
for (int i = 0; i < thread_num; ++i) {
if (i < big_core_size) {
active_ids_.push_back(big_core_ids_[i]);
} else if (i < big_core_size + little_core_size) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
}
void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// LITE_POWER_NO_BIND: no CPU affinity is applied; active_ids_ only encodes
// the requested thread count (each slot holds placeholder core 0).
void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
  active_ids_.assign(thread_num, 0);
  mode_ = LITE_POWER_NO_BIND;
}
// LITE_POWER_RAND_HIGH: bind to big cores, rotating the starting core by
// shift_num to spread wear/heat across the cluster; degrades to LOW mode on
// the little cores when there is no big cluster.
void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
                                          const int thread_num) {
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  // BUGFIX: every other RequestPower*Mode clears active_ids_ first; without
  // this, repeated SetRunMode calls kept appending core ids.
  active_ids_.clear();
  if (big_core_size > 0) {
    mode_ = LITE_POWER_RAND_HIGH;
    if (thread_num > big_core_size) {
      LOG(WARNING) << "Request thread num: " << thread_num
                   << ", exceed the big cores size: " << big_core_size
                   << ", truncate thread num to " << big_core_size;
      active_ids_ = big_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
      }
    }
  } else {
    mode_ = LITE_POWER_LOW;
    LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
    if (thread_num > little_core_size) {
      active_ids_ = little_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(little_core_ids_[i]);
      }
    }
  }
}
void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % little_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// One-time device probing: core count, memory size, per-core arch,
// max/min frequencies, cache sizes and big/little topology.
// Always returns 0 (kept for the static-init idiom in DeviceInfo::Init).
int DeviceInfo::Setup() {
  core_num_ = get_cpu_num();
  mem_size_ = get_mem_size();
  get_cpu_arch(&archs_, core_num_);
  // Set default CPU cache info (argc == 1 -> one uniform size per level).
  // BUGFIX: the size was previously passed in the argc slot, e.g.
  // SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE), which made SetCacheInfo take the
  // argc != 1 branch and read two non-existent varargs (undefined behavior).
  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
#ifdef LITE_WITH_LINUX
  // Get max & min frequency of every core (sysfs reports kHz; store MHz).
  max_freqs_.resize(core_num_);
  min_freqs_.resize(core_num_);
  for (int i = 0; i < core_num_; ++i) {
    int max_freq, min_freq;
    get_cpu_max_min_freq(i, &max_freq, &min_freq);
    max_freqs_[i] = max_freq / 1000;
    min_freqs_[i] = min_freq / 1000;
  }
  // Fill cache sizes and big.LITTLE core ids: prefer the known-SoC table,
  // fall back to frequency-based probing.
  dev_name_ = get_cpu_name();
  if (!SetCPUInfoByName()) {
    SetCPUInfoByProb();
  }
  // Log the detected topology.
  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
              << ", max freq: " << max_freqs_[i]
              << ", min freq: " << min_freqs_[i]
              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
              << ", CPU ARCH: A" << archs_[i];
  }
  LOG(INFO) << "L1 DataCache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "L2 Cache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
  // Set default run mode: single thread, no core binding.
  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
  return 0;
}
// Selects the power mode and thread count, binds threads to cores where
// supported (Linux + OpenMP), and sizes the sgemm workspace for the active
// core's L2 cache. Falls back to NO_BIND when any active core is offline.
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
#else
  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
#endif
#ifdef LITE_WITH_LINUX
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  int big_little_core_size = big_core_size + little_core_size;
  thread_num = std::min(thread_num, big_little_core_size);
  count_++;
  // Rotate the RAND_* start core every 10 calls.
  // BUGFIX: guard against big_core_size == 0 (SoCs without a big cluster)
  // which previously divided by zero.
  int shift_num = big_core_size > 0 ? (count_ / 10) % big_core_size : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      RequestPowerFullMode(thread_num);
      break;
    case LITE_POWER_HIGH:
      RequestPowerHighMode(thread_num);
      break;
    case LITE_POWER_LOW:
      RequestPowerLowMode(thread_num);
      break;
    case LITE_POWER_NO_BIND:
      RequestPowerNoBindMode(thread_num);
      break;
    case LITE_POWER_RAND_HIGH:
      RequestPowerRandHighMode(shift_num, thread_num);
      break;
    case LITE_POWER_RAND_LOW:
      RequestPowerRandLowMode(shift_num, thread_num);
      break;
    default:
      LOG(FATAL) << "Unsupported power mode: " << mode;
      break;
  }
  // Always keep at least one active core so workspace sizing below is valid.
  if (active_ids_.size() == 0) {
    active_ids_.push_back(0);
  }
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
  if (mode_ != LITE_POWER_NO_BIND) {
    if (check_cpu_online(active_ids_)) {
      bind_threads(active_ids_);
    } else {
      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
      mode_ = LITE_POWER_NO_BIND;
    }
  }
#else   // LITE_WITH_LINUX
  // only LITE_POWER_NO_BIND is supported in other OS
  RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
#endif  // LITE_WITH_LINUX
  //! alloc memory for sgemm in this context
  workspace_.Resize(
      {static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
  arch_ = archs_[active_ids_[0]];
}
// Overrides the L1/L2/L3 cache sizes (bytes) for all cores with
// user-provided values and resizes the shared workspace accordingly.
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
  // BUGFIX: the sizes were previously passed in SetCacheInfo's argc slot
  // (SetCacheInfo(0, l1size)), making it take the argc != 1 branch and read
  // two non-existent varargs (undefined behavior). argc must be 1 here.
  SetCacheInfo(0, 1, l1size);
  SetCacheInfo(1, 1, l2size);
  SetCacheInfo(2, 1, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Grows the shared workspace to hold `dims` elements plus headroom for the
// active core's L2 cache. Returns false when the workspace already has
// exactly that element count, true after resizing.
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize({static_cast<int64_t>(
      requested + L2_cache_[active_ids_[0]] / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
......
......@@ -14,24 +14,12 @@
#pragma once
#include <cstdarg>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
#include "paddle/fluid/lite/utils/cp_logging.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
namespace paddle {
namespace lite {
......@@ -60,64 +48,73 @@ typedef enum {
class DeviceInfo {
public:
int idx_;
int max_freq_;
int min_freq_;
int generate_arch_;
int compute_core_num_;
int max_memory_;
int sharemem_size_;
std::string device_name_;
std::string compute_ability_;
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
static DeviceInfo& Global() {
static auto* x = new DeviceInfo;
return *x;
}
static void Init() {
auto& info = Global();
InitInternal(&info);
static int Init() {
static int ret = Global().Setup();
return ret;
}
private:
DeviceInfo() = default;
static void InitInternal(DeviceInfo* dev);
};
int Setup();
size_t arm_get_meminfo();
void SetRunMode(PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
int arm_get_cpucount();
PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
void arm_get_cpu_arch(std::vector<ARMArch>* archs);
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev);
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
}
bool ExtendWorkspace(DDimLite dims);
std::string arm_get_cpu_name();
private:
int core_num_;
std::vector<int> max_freqs_;
std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_;
int get_max_freq_khz(int cpuid);
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids);
int check_online(const std::vector<int>& core_ids);
int set_sched_affinity(const std::vector<int>& cpuids);
ARMArch arch_;
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
void SetCacheInfo(int cache_id, int argc, ...);
void SetArchInfo(int argc, ...);
bool SetCPUInfoByName();
void SetCPUInfoByProb();
void RequestPowerFullMode(const int thread_num);
void RequestPowerHighMode(const int thread_num);
void RequestPowerLowMode(const int thread_num);
void RequestPowerNoBindMode(const int thread_num);
void RequestPowerRandHighMode(const int shift_num, const int thread_num);
void RequestPowerRandLowMode(const int shift_num, const int thread_num);
#endif // LITE_WITH_LINUX
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
......@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
auto o_dims = param.output->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
// TODO(xxx): make api and expose it
ctx.SetRunMode(LITE_POWER_HIGH, 4);
int win = x_dims[3]; // nchw
int hin = x_dims[2];
......
......@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
......
......@@ -24,7 +24,6 @@ namespace arm {
void MulCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void MulCompute::Run() {
......
......@@ -26,7 +26,6 @@ namespace arm {
void PoolCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void PoolCompute::Run() {
......
......@@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps}
cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
......@@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x
lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
set(x86_kernels
......@@ -44,6 +46,7 @@ set(x86_kernels
concat_compute_x86
conv_compute_x86
pool_compute_x86
batch_norm_compute_x86
)
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
// Registers the x86 float NCHW batch_norm kernel and declares its
// input/output tensor bindings.
REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
    // BUGFIX: "MeanOut" was previously bound twice; keep a single binding.
    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
    .Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <random>
#include <string>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
template <typename T>
using EigenArrayMap =
Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using ConstEigenArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
template <typename T>
using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
template <typename T>
using ConstEigenVectorArrayMap =
Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
// Batch-normalization kernel for x86 (float).
// Training path (!is_test && !use_global_stats): computes per-channel batch
// mean/variance into SavedMean/SavedVariance and updates the running
// statistics with momentum. Inference path (global_stats): normalizes with
// the provided running Mean/Variance.
// Only the NCHW layout is implemented; other layouts abort via LOG(FATAL).
template <typename T>
class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
 public:
  using param_t = operators::BatchNormParam;
  void Run() override {
    auto &param = *param_.get_mutable<operators::BatchNormParam>();
    bool global_stats = param.is_test || param.use_global_stats;
    const auto *x = param.x;
    const auto &x_dims = x->dims();
    CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
    const int N = x_dims[0];
    // Channel axis depends on layout: dim 1 for NCHW, last dim otherwise.
    const int C = param.data_layout == DATALAYOUT(kNCHW)
                      ? x_dims[1]
                      : x_dims[x_dims.size() - 1];
    // Elements per (n, c) slice, e.g. H*W for 4-D NCHW input.
    const int sample_size = x->dims().production() / N / C;
    // alloc memory for all outputs
    param.y->template mutable_data<T>();
    param.mean_out->template mutable_data<T>();
    param.variance_out->template mutable_data<T>();
    param.saved_mean->template mutable_data<T>();
    param.saved_variance->template mutable_data<T>();
    if (!global_stats) {
      // saved_xx holds statistics of just this batch of data
      EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
                                          C);
      EigenVectorArrayMap<T> saved_variance_e(
          param.saved_variance->mutable_data<T>(), C);
      saved_mean_e.setZero();
      saved_variance_e.setZero();
      EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(),
                                              C);
      EigenVectorArrayMap<T> running_var_arr(
          param.variance_out->mutable_data<T>(), C);
      if ((N * sample_size) == 1) {
        // A single element per channel has zero variance; normalizing would
        // be meaningless, so pass the input through unchanged.
        LOG(WARNING) << "Only 1 element in normalization dimension, "
                     << "we skip the batch norm calculation, let y = x.";
        framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
                              &param.y->raw_tensor());
        return;
      }
      switch (param.data_layout) {
        case DATALAYOUT(kNCHW): {
          // View x as (sample_size, N*C); column nc belongs to channel nc % C.
          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
          for (int nc = 0; nc < N * C; ++nc) {
            saved_mean_e(nc % C) += x_arr.col(nc).sum();
          }
          saved_mean_e /= N * sample_size;
          for (int nc = 0; nc < N * C; ++nc) {
            saved_variance_e(nc % C) +=
                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
          }
          // Biased (population) variance: divide by N*sample_size.
          saved_variance_e /= N * sample_size;
          break;
        }
        default:
          LOG(FATAL) << "Unknown storage order: "
                     << DataLayoutToStr(param.data_layout);
          break;
      }
      // Exponential moving average update of the running statistics.
      running_mean_arr = running_mean_arr * param.momentum +
                         saved_mean_e * (1. - param.momentum);
      running_var_arr = running_var_arr * param.momentum +
                        saved_variance_e * (1. - param.momentum);
    }
    // use SavedMean and SavedVariance to do normalize
    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
    if (global_stats) {
      ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
      inv_std = (var_arr + param.epsilon).sqrt().inverse();
    } else {
      EigenVectorArrayMap<T> saved_inv_std(
          param.saved_variance->mutable_data<T>(), C);
      // inverse SavedVariance first, gradient will use it too.
      // (1/(v+eps))^0.5 equals 1/sqrt(v+eps) for v+eps > 0, matching the
      // global_stats branch above; SavedVariance is overwritten in place.
      saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
      inv_std = saved_inv_std;
    }
    ConstEigenVectorArrayMap<T> mean_arr(
        global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);
    // (x - est_mean) * inv_var * scale + bias
    // formula transform ====>
    // (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
    ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
    ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
        bias_arr - mean_arr * inv_std * scale_arr;
    switch (param.data_layout) {
      case DATALAYOUT(kNCHW): {
        EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
        for (int nc = 0; nc < N * C; ++nc) {
          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
        }
        break;
      }
      default:
        LOG(FATAL) << "Unknown storage order: "
                   << DataLayoutToStr(param.data_layout);
        break;
    }
  }
  virtual ~BatchNormCompute() = default;
};
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
#include <gtest/gtest.h>
#include <iostream>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace x86 {
// Verifies that the batch_norm kernel is discoverable via the registry.
TEST(batch_norm_x86, retrive_op) {
  auto kernels =
      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
          "batch_norm");
  ASSERT_FALSE(kernels.empty());
  ASSERT_TRUE(kernels.front());
}
TEST(batch_norm_x86, init) {
  // A default-constructed kernel reports the precision/target it was
  // registered with.
  BatchNormCompute<float> bn_kernel;
  ASSERT_EQ(bn_kernel.precision(), PRECISION(kFloat));
  ASSERT_EQ(bn_kernel.target(), TARGET(kX86));
}
TEST(batch_norm_x86, run_test) {
  // End-to-end smoke test of the x86 batch_norm kernel in training mode
  // (is_test = false, use_global_stats = false): fill deterministic inputs,
  // run the kernel, and log every output tensor.
  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
      saved_mean, saved_variance;
  constexpr int batch_size = 2;
  constexpr int channels = 3;
  const std::vector<int64_t> x_shape{batch_size, channels, 64, 64};
  // Every per-channel tensor (scale/bias/mean/variance and all the
  // statistic outputs) shares the same 1-D {channels} shape.
  const std::vector<int64_t> channel_shape{channels};

  x.Resize(lite::DDim(x_shape));
  y.Resize(lite::DDim(x_shape));
  scale.Resize(lite::DDim(channel_shape));
  bias.Resize(lite::DDim(channel_shape));
  mean.Resize(lite::DDim(channel_shape));
  variance.Resize(lite::DDim(channel_shape));
  mean_out.Resize(lite::DDim(channel_shape));
  variance_out.Resize(lite::DDim(channel_shape));
  saved_mean.Resize(lite::DDim(channel_shape));
  saved_variance.Resize(lite::DDim(channel_shape));

  auto x_data = x.mutable_data<float>();
  auto scale_data = scale.mutable_data<float>();
  auto bias_data = bias.mutable_data<float>();
  auto mean_data = mean.mutable_data<float>();
  auto variance_data = variance.mutable_data<float>();
  // Output tensors only need their buffers allocated; the kernel fills them.
  y.mutable_data<float>();
  mean_out.mutable_data<float>();
  variance_out.mutable_data<float>();
  saved_mean.mutable_data<float>();
  saved_variance.mutable_data<float>();

  // Deterministic input data so any failure is reproducible.
  for (int64_t i = 0; i < x.dims().production(); i++) {
    x_data[i] = static_cast<float>(i);
  }
  for (int i = 0; i < scale.dims().production(); i++) {
    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
  }
  for (int i = 0; i < bias.dims().production(); i++) {
    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
  }
  for (int i = 0; i < mean.dims().production(); i++) {
    mean_data[i] = static_cast<float>(i) * 0.0565f;
  }
  for (int i = 0; i < variance.dims().production(); i++) {
    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
  }

  BatchNormCompute<float> batch_norm;
  operators::BatchNormParam param;
  param.x = &x;
  param.is_test = false;
  param.scale = &scale;
  param.bias = &bias;
  param.mean = &mean;
  param.variance = &variance;
  param.use_global_stats = false;
  param.epsilon = 1e-4f;
  param.momentum = 0.9f;
  param.y = &y;
  param.mean_out = &mean_out;
  param.variance_out = &variance_out;
  param.saved_mean = &saved_mean;
  param.saved_variance = &saved_variance;

  batch_norm.SetParam(param);
  batch_norm.Run();

  LOG(INFO) << "output: " << y;
  LOG(INFO) << "mean_out: " << mean_out;
  // Fixed: this line previously streamed mean_out under the
  // "variance_out" label, hiding the real variance_out contents.
  LOG(INFO) << "variance_out: " << variance_out;
  LOG(INFO) << "saved_mean: " << saved_mean;
  LOG(INFO) << "saved_variance: " << saved_variance;
}
} // namespace x86
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
......@@ -135,8 +135,8 @@ function test_arm_model {
adb -s emulator-${port} push ${model_dir} ${adb_work_dir}
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
}
......@@ -225,16 +225,11 @@ function test_arm {
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
done
# TODO(sangoly): refine this
test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu"
}
# Build the code and run lite arm tests. This is executed in the CI system.
function build_test_arm {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
function prepare_emulator {
local port_armv8=$1
local port_armv7=$2
adb kill-server
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
......@@ -245,6 +240,18 @@ function build_test_arm {
echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
sleep 1m
}
# We split the arm unittest into several sub-tasks to parallel and reduce the overall CI timetime.
# sub-task1
function build_test_arm_subtask_android {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
prepare_emulator $port_armv8 $port_armv7
# job 1
build_arm "android" "armv8" "gcc"
......@@ -252,9 +259,9 @@ function build_test_arm {
cd -
# job 2
build_arm "android" "armv8" "clang"
test_arm "android" "armv8" "clang" ${port_armv8}
cd -
#build_arm "android" "armv8" "clang"
#test_arm "android" "armv8" "clang" ${port_armv8}
#cd -
# job 3
build_arm "android" "armv7" "gcc"
......@@ -262,13 +269,22 @@ function build_test_arm {
cd -
# job 4
build_arm "android" "armv7" "clang"
test_arm "android" "armv7" "clang" ${port_armv7}
cd -
#build_arm "android" "armv7" "clang"
#test_arm "android" "armv7" "clang" ${port_armv7}
#cd -
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
}
# sub-task2
function build_test_arm_subtask_armlinux {
########################################################################
# job 1-4 must be in one runner
port_armv8=5554
port_armv7=5556
prepare_emulator $port_armv8 $port_armv7
# job 5
build_arm "armlinux" "armv8"
......@@ -285,9 +301,47 @@ function build_test_arm {
test_arm "armlinux" "armv7hf"
cd -
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
}
# sub-task3: build the C++ API test binary for a single Android/armv8/gcc
# configuration and run the mobilenet_v2_relu model on an armv8 emulator.
function build_test_arm_subtask3_mobilenet_v2 {
    local port_armv8=5554
    local port_armv7=5556
    # We just test following single one environment to limit the CI time.
    local os=android
    local abi=armv8
    local lang=gcc

    # NOTE(review): cur_dir/build_dir are not declared `local`, so they become
    # globals — presumably other functions in this script rely on that; confirm
    # before making them local.
    cur_dir=$(pwd)
    build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
    mkdir -p $build_dir
    cd $build_dir

    cmake_arm $os $abi $lang
    # Only build the single test binary needed for the model run.
    make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE

    # Boots both emulators (armv8 + armv7) even though only armv8 is used below.
    prepare_emulator $port_armv8 $port_armv7

    # just test the model on armv8
    test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"

    # Kill every running emulator so the runner is left clean for the next job.
    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    echo "Done"
}
# Build the code and run lite arm tests. This is executed in the CI system.
# Aggregates the Android and armlinux sub-tasks so a single CI entry point
# still runs the full arm suite.
function build_test_arm {
    ########################################################################
    # job 1-4 must be in one runner
    # NOTE(review): these port globals appear redundant — both sub-tasks set
    # their own ports — but they are left in place in case other code reads
    # them; confirm before removing.
    port_armv8=5554
    port_armv7=5556

    build_test_arm_subtask_android
    build_test_arm_subtask_armlinux
}
############################# MAIN #################################
function print_usage {
echo -e "\nUSAGE:"
......@@ -379,6 +433,18 @@ function main {
build_test_arm
shift
;;
build_test_arm_subtask_android)
build_test_arm_subtask_android
shift
;;
build_test_arm_subtask_armlinux)
build_test_arm_subtask_armlinux
shift
;;
build_test_arm_model1)
build_test_arm_subtask3_mobilenet_v2
shift
;;
check_style)
check_style
shift
......@@ -397,4 +463,3 @@ function main {
}
main $@
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册