Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

58bf3c48 · nhzlx · 7e2ecbd6 · 758db8df · 58bf3c48 · 58bf3c48
15 changed files
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,6 +2,20 @@ before_script:
  - env
  - export CI_USER_DIR=$(pwd)
+  # prepare ccache; -y keeps apt non-interactive so a CI job cannot abort on a confirmation prompt
+  - apt install -y ccache
+  # for proxy
+  - export http_proxy=$CI_PROXY
+  - export https_proxy=$CI_PROXY
+  # merge the latest code
+  - git config --global user.email "you@example.com"
+  - git config --global user.name "Your Name"
+  - git fetch origin incubate/lite
+  - git merge --no-ff origin/incubate/lite
 image: $SERVER_LITE_DOCKER_IMAGE
 stages:
@@ -14,19 +28,13 @@ check:prebuilt:
        - lite
    stage: ci
    script:
+        # prepare for pre-commit
        - rm -rf ~/.pip
-        - export http_proxy=$CI_PROXY
-        - export https_proxy=$CI_PROXY
        - pip install pre-commit
        - pre-commit install
-        # merge the latest code
-        - git config --global user.email "you@example.com"
-        - git config --global user.name "Your Name"
-        - git fetch origin incubate/lite
-        - git merge --no-ff origin/incubate/lite
        - ./paddle/fluid/lite/tools/build.sh check_style
    cache:
        key: check_style
        paths:
@@ -42,17 +50,11 @@ build:server:
        paths:
            - build/third_party
            - ~/.ccache
+            - $CI_PROJECT_DIR/_build_server_ccache
    script:
-        - apt install ccache
+        # customize ccache path for specifying runner cache
-        - export http_proxy=$CI_PROXY
+        - export CCACHE_DIR=$CI_PROJECT_DIR/_build_server_ccache
-        - export https_proxy=$CI_PROXY
+        # run build and test
-        # merge the latest code
-        - git config --global user.email "you@example.com"
-        - git config --global user.name "Your Name"
-        - git fetch origin incubate/lite
-        - git merge --no-ff origin/incubate/lite
        - mkdir -p build
        - cd build
        - ../paddle/fluid/lite/tools/build.sh cmake_x86
@@ -66,7 +68,27 @@ build:server:
    dependencies:
        - check:prebuilt
-build:mobile:
+build:mobile_android:
+    tags:
+        - lite
+    stage: build_mobile
+    image: $MOBILE_LITE_DOCKER_IMAGE
+    cache:
+        key: mobile_thirdparty
+        paths:
+            - $MOBILE_LITE_CACHE0
+            - $MOBILE_LITE_CACHE1
+            - ~/.ccache
+            - $CI_PROJECT_DIR/build_mobile_ccache
+    script:
+        - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache
+        - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_android
+    dependencies:
+        - build:server
+build:mobile_armlinux:
    tags:
        - lite
    stage: build_mobile
@@ -77,17 +99,43 @@ build:mobile:
            - $MOBILE_LITE_CACHE0
            - $MOBILE_LITE_CACHE1
            - ~/.ccache
+            - $CI_PROJECT_DIR/build_mobile_ccache2
    script:
-        - apt install ccache
+        - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_ccache2
-        - export http_proxy=$CI_PROXY
+        - ./paddle/fluid/lite/tools/build.sh build_test_arm_subtask_armlinux
-        - export https_proxy=$CI_PROXY
+    dependencies:
+        - build:server
+    # NOTE: do not add a second `cache:` key to this job. Duplicate mapping keys
+    # are resolved last-wins, which would drop $CI_PROJECT_DIR/build_mobile_ccache2
+    # (the CCACHE_DIR exported in `script`) from the cached paths.
-        # merge the latest code
-        - git config --global user.email "you@example.com"
-        - git config --global user.name "Your Name"
-        - git fetch origin incubate/lite
-        - git merge --no-ff origin/incubate/lite
+build:mobile_model_mobilenetv2:
+    tags:
+        - lite
+    stage: build_mobile
+    image: $MOBILE_LITE_DOCKER_IMAGE
+    # Single cache definition for this job. It lists the CCACHE_DIR exported in
+    # `script` so compiler cache entries are kept between pipeline runs.
+    cache:
+        key: mobile_thirdparty
+        paths:
+            - $MOBILE_LITE_CACHE0
+            - $MOBILE_LITE_CACHE1
+            - ~/.ccache
+            - $CI_PROJECT_DIR/build_mobile_model1
+    script:
+        - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
+        - ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
-        - ./paddle/fluid/lite/tools/build.sh build_test_arm
    dependencies:
        - build:server
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
  return counter.count() / 1000.0;
 }
-void Run(const char* model_dir, int repeat) {
+void Run(const char* model_dir, int repeat, int thread_num) {
 #ifdef LITE_WITH_ARM
  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
 #endif
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
 }  // namespace paddle
 int main(int argc, char** argv) {
-  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
+  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
-  paddle::lite::Run(argv[1], std::stoi(argv[2]));
+  paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
  return 0;
 }

--- a/paddle/fluid/lite/core/context.cc
+++ b/paddle/fluid/lite/core/context.cc
@@ -13,322 +13,7 @@
 // limitations under the License.
 #include "paddle/fluid/lite/core/context.h"
-#include "paddle/fluid/lite/core/cpu_info.h"
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif
 namespace paddle {
-namespace lite {
+namespace lite {}  // namespace lite
-#ifdef LITE_WITH_ARM
-void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int cpu_count = arm_get_cpucount();
-  dev.L1_cache_.resize(cpu_count);
-  dev.L2_cache_.resize(cpu_count);
-  dev.L3_cache_.resize(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    dev.L1_cache_[i] = l1size;
-    dev.L2_cache_[i] = l2size;
-    dev.L3_cache_[i] = l3size;
-  }
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-Context<TargetType::kARM>::Context() {
-  active_ids_ = {0};
-  mode_ = LITE_POWER_HIGH;
-  DeviceInfo& dev = DeviceInfo::Global();
-  workspace_.Resize(
-      {static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
-#ifdef TARGET_IOS
-  arch_ = APPLE;  // use 6x8
-#else
-  if (dev.big_core_ids_.size() > 0) {
-    arch_ = dev.archs_[dev.big_core_ids_[0]];
-  }
-#endif
-}
-PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
-int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
-Context<TargetType::kARM>::Context(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-}
-ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-  return *this;
-}
-void Context<TargetType::kARM>::BindDev() {
-#ifdef ARM_WITH_OMP
-  int num_threads = active_ids_.size();
-  omp_set_num_threads(num_threads);
-#ifdef LITE_WITH_LINUX
-  std::vector<int> ssarets;
-  for (int j = 0; j < num_threads; ++j) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < num_threads; i++) {
-    ssarets[i] = set_sched_affinity(active_ids_);
-  }
-  for (int i = 0; i < num_threads; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
-      return;
-    }
-  }
-#endif  // LITE_WITH_LINUX
-#else   // ARM_WITH_OMP
-#ifdef LITE_WITH_LINUX
-  std::vector<int> cpuid1;
-  cpuid1.push_back(active_ids_[0]);
-  int ssaret = set_sched_affinity(cpuid1);
-  if (ssaret != 0) {
-    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
-    return;
-  }
-#endif  // LITE_WITH_LINUX
-#endif  // ARM_WITH_OMP
-}
-void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int big_core_size = dev.big_core_ids_.size();
-  int small_core_size = dev.little_core_ids_.size();
-  if (threads > big_core_size + small_core_size) {
-    threads = big_core_size + small_core_size;
-  }
-#ifdef ARM_WITH_OMP
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case LITE_POWER_FULL:
-      mode_ = mode;
-      active_ids_.clear();
-      for (int i = 0; i < threads; ++i) {
-        if (i < big_core_size) {
-          active_ids_.push_back(dev.big_core_ids_[i]);
-        } else {
-          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_HIGH;
-        if (threads > big_core_size) {
-          LOG(ERROR) << "threads: " << threads
-                     << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_NO_BIND:
-      mode_ = LITE_POWER_NO_BIND;
-      active_ids_.clear();
-      if (threads > dev.core_ids_.size()) {
-        active_ids_.resize(dev.core_ids_.size());
-      } else {
-        active_ids_.resize(threads);
-      }
-      break;
-    case LITE_POWER_RAND_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_RAND_HIGH;
-        if (threads > big_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                dev.big_core_ids_[(i + shift_num) % big_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(WARNING)
-            << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_RAND_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_RAND_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                dev.little_core_ids_[(i + shift_num) % small_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-  }
-  //! fix multi-threads LITE_POWER_HIGH mode
-  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
-    int threads = active_ids_.size();
-    omp_set_num_threads(threads);
-  } else {
-    if (check_online(active_ids_)) {
-      BindDev();
-    } else {
-      LOG(ERROR) << "core id " << active_ids_[0]
-                 << " is offline, switch to NO BIND MODE";
-      int threads = active_ids_.size();
-      omp_set_num_threads(threads);
-    }
-  }
-#else
-  if (big_core_size > 0) {
-    active_ids_ = {dev.big_core_ids_[0]};
-  } else {
-    active_ids_ = {0};
-  }
-#endif
-  //! alloc memory for sgemm in this context
-  int temp_mem_size =
-      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
-  workspace_.Resize({temp_mem_size});
-  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
-}
-ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
-void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
-int Context<TargetType::kARM>::l1_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L1_cache_[active_ids_[0]];
-}
-int Context<TargetType::kARM>::l2_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L2_cache_[active_ids_[0]];
-}
-int Context<TargetType::kARM>::l3_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L3_cache_[active_ids_[0]];
-}
-bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
-  auto count = dims.product();
-  auto old = workspace_.dims();
-  if (count == old.product()) {
-    return false;
-  }
-  workspace_.Resize(
-      {static_cast<int64_t>(count + l2_cache_size() / sizeof(float))});
-  return true;
-}
-#endif  // LITE_WITH_ARM
-}  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/context.h
+++ b/paddle/fluid/lite/core/context.h
@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
 template <>
 class Context<TargetType::kARM> {
 public:
-  Context();
+  Context() {}
-  Context(PowerMode mode, int threads);
  explicit Context(const ARMContext& ctx);
-  ARMContext& operator=(const ARMContext& ctx);
+  // Nothing to copy: this class holds no state of its own and forwards every
+  // query to DeviceInfo::Global(). The explicit return is required because
+  // flowing off the end of a non-void function is undefined behavior.
+  ARMContext& operator=(const ARMContext& ctx) { return *this; }
  // NOTE: InitOnce should only be used by ContextScheduler
  void InitOnce() { DeviceInfo::Init(); }
  void CopyShared(const ARMContext* ctx) {}
-  void SetRunMode(PowerMode mode, int threads);
+  void SetRunMode(PowerMode mode, int threads) {
-  void SetCache(int l1size, int l2size, int l3size);
+    return DeviceInfo::Global().SetRunMode(mode, threads);
-  void SetArch(ARMArch arch);
+  }
-  void BindDev();
+  void SetCache(int l1size, int l2size, int l3size) {
+    return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
+  }
+  void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
-  PowerMode mode() const;
+  PowerMode mode() const { return DeviceInfo::Global().mode(); }
-  int threads() const;
+  int threads() const { return DeviceInfo::Global().threads(); }
-  ARMArch arch() const;
+  ARMArch arch() const { return DeviceInfo::Global().arch(); }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
  template <typename T>
  T* workspace_data() {
-    return workspace_.mutable_data<T>();
+    return DeviceInfo::Global().workspace_data<T>();
  }
-  int l1_cache_size() const;
+  bool ExtendWorkspace(DDimLite dims) {
-  int l2_cache_size() const;
+    return DeviceInfo::Global().ExtendWorkspace(dims);
-  int l3_cache_size() const;
+  }
-  bool ExtendWorkspace(DDimLite dims);
  std::string name() const { return "ARMContext"; }
- private:
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  ARMArch arch_;
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
 };
 #endif

--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
--- a/paddle/fluid/lite/core/cpu_info.h
+++ b/paddle/fluid/lite/core/cpu_info.h
@@ -14,24 +14,12 @@
 #pragma once
+#include <cstdarg>
 #include <string>
 #include <vector>
+#include "paddle/fluid/lite/core/lite_tensor.h"
 #include "paddle/fluid/lite/utils/cp_logging.h"
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
 namespace paddle {
 namespace lite {
@@ -60,64 +48,73 @@ typedef enum {
 class DeviceInfo {
 public:
-  int idx_;
-  int max_freq_;
-  int min_freq_;
-  int generate_arch_;
-  int compute_core_num_;
-  int max_memory_;
-  int sharemem_size_;
-  std::string device_name_;
-  std::string compute_ability_;
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
  static DeviceInfo& Global() {
    static auto* x = new DeviceInfo;
    return *x;
  }
-  static void Init() {
+  static int Init() {
-    auto& info = Global();
+    static int ret = Global().Setup();
-    InitInternal(&info);
+    return ret;
  }
- private:
+  int Setup();
-  DeviceInfo() = default;
-  static void InitInternal(DeviceInfo* dev);
-};
-size_t arm_get_meminfo();
+  void SetRunMode(PowerMode mode, int thread_num);
+  void SetCache(int l1size, int l2size, int l3size);
+  void SetArch(ARMArch arch) { arch_ = arch; }
-int arm_get_cpucount();
+  PowerMode mode() const { return mode_; }
+  int threads() const { return active_ids_.size(); }
+  ARMArch arch() const { return arch_; }
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
-void arm_get_cpu_arch(std::vector<ARMArch>* archs);
+  template <typename T>
+  T* workspace_data() {
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
+    return workspace_.mutable_data<T>();
+  }
-#ifdef LITE_WITH_LINUX
+  bool ExtendWorkspace(DDimLite dims);
-void set_default_cache(DeviceInfo* dev);
-std::string arm_get_cpu_name();
+ private:
+  int core_num_;
+  std::vector<int> max_freqs_;
+  std::vector<int> min_freqs_;
+  int mem_size_;
+  std::string dev_name_;
-int get_max_freq_khz(int cpuid);
+  std::vector<int> L1_cache_;
+  std::vector<int> L2_cache_;
+  std::vector<int> L3_cache_;
+  std::vector<int> core_ids_;
+  std::vector<int> big_core_ids_;
+  std::vector<int> little_core_ids_;
+  std::vector<int> cluster_ids_;
+  std::vector<ARMArch> archs_;
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
+  ARMArch arch_;
-                                    const std::vector<int>& cpu_freq,
+  // LITE_POWER_HIGH stands for using big cores,
-                                    std::vector<int>* cluster_ids);
+  // LITE_POWER_LOW stands for using small core,
-int check_online(const std::vector<int>& core_ids);
+  // LITE_POWER_FULL stands for using all cores
-int set_sched_affinity(const std::vector<int>& cpuids);
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
+  void SetCacheInfo(int cache_id, int argc, ...);
+  void SetArchInfo(int argc, ...);
+  bool SetCPUInfoByName();
+  void SetCPUInfoByProb();
+  void RequestPowerFullMode(const int thread_num);
+  void RequestPowerHighMode(const int thread_num);
+  void RequestPowerLowMode(const int thread_num);
+  void RequestPowerNoBindMode(const int thread_num);
+  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
+  void RequestPowerRandLowMode(const int shift_num, const int thread_num);
-#endif  // LITE_WITH_LINUX
+  DeviceInfo() = default;
+};
 #endif  // LITE_WITH_ARM

--- a/paddle/fluid/lite/kernels/arm/conv_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc
@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
  auto o_dims = param.output->dims();
  auto& ctx = this->ctx_->template As<ARMContext>();
-  // TODO(xxx): make api and expose it
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
  int win = x_dims[3];  // nchw
  int hin = x_dims[2];

--- a/paddle/fluid/lite/kernels/arm/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc
@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
  auto w_dims = param.w->dims();
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
  CHECK_GE(x_dims.size(), 2UL);
  CHECK_EQ(w_dims.size(), 2UL);

--- a/paddle/fluid/lite/kernels/arm/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc
@@ -24,7 +24,6 @@ namespace arm {
 void MulCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }
 void MulCompute::Run() {

--- a/paddle/fluid/lite/kernels/arm/pool_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc
@@ -26,7 +26,6 @@ namespace arm {
 void PoolCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }
 void PoolCompute::Run() {

--- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt
@@ -17,6 +17,7 @@ cc_library(dropout_compute_x86 SRCS dropout_compute.cc DEPS ${lite_kernel_deps}
 cc_library(concat_compute_x86 SRCS concat_compute.cc DEPS ${lite_kernel_deps} )
 cc_library(conv_compute_x86 SRCS conv_compute.cc DEPS ${lite_kernel_deps} blas im2col vol2col)
 cc_library(pool_compute_x86 SRCS pool_compute.cc DEPS ${lite_kernel_deps} pooling)
+cc_library(batch_norm_compute_x86 SRCS batch_norm_compute.cc DEPS ${lite_kernel_deps})
 lite_cc_test(test_fc_compute_x86 SRCS fc_compute_test.cc DEPS fc_compute_x86)
 lite_cc_test(test_conv2d_compute_x86 SRCS conv_compute_test.cc DEPS conv_compute_x86)
@@ -28,6 +29,7 @@ lite_cc_test(test_relu_compute_x86 SRCS relu_compute_test.cc DEPS relu_compute_x
 lite_cc_test(test_mul_compute_x86 SRCS mul_compute_test.cc DEPS mul_compute_x86 operator)
 lite_cc_test(test_scale_compute_x86 SRCS scale_compute_test.cc DEPS scale_compute_x86)
 lite_cc_test(test_dropout_compute_x86 SRCS dropout_compute_test.cc DEPS dropout_compute_x86)
+lite_cc_test(test_batch_norm_compute_x86 SRCS batch_norm_compute_test.cc DEPS batch_norm_compute_x86)
 set(x86_kernels
@@ -44,6 +46,7 @@ set(x86_kernels
    concat_compute_x86
    conv_compute_x86 
    pool_compute_x86  
+    batch_norm_compute_x86 
    )
 set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")

--- a/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc
+++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
+// Registers the float / NCHW batch_norm kernel for the x86 target.
+// Every input and output slot is bound exactly once.
+REGISTER_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW,
+                     paddle::lite::kernels::x86::BatchNormCompute<float>, def)
+    .BindInput("X", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Scale", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Bias", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Mean", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindInput("Variance", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("Y", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("MeanOut", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("VarianceOut", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("SavedMean", {LiteType::GetTensorTy(TARGET(kX86))})
+    .BindOutput("SavedVariance", {LiteType::GetTensorTy(TARGET(kX86))})
+    .Finalize();
--- a/paddle/fluid/lite/kernels/x86/batch_norm_compute.h
+++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <random>
+#include <string>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/lite/core/kernel.h"
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+template <typename T>
+using EigenArrayMap =
+    Eigen::Map<Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using ConstEigenArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, Eigen::Dynamic>>;
+template <typename T>
+using EigenVectorArrayMap = Eigen::Map<Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+using ConstEigenVectorArrayMap =
+    Eigen::Map<const Eigen::Array<T, Eigen::Dynamic, 1>>;
+template <typename T>
+class BatchNormCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
+ public:
+  using param_t = operators::BatchNormParam;
+  void Run() override {
+    auto &param = *param_.get_mutable<operators::BatchNormParam>();
+    bool global_stats = param.is_test || param.use_global_stats;
+    const auto *x = param.x;
+    const auto &x_dims = x->dims();
+    CHECK(x_dims.size() >= 2 && x_dims.size() <= 5);
+    const int N = x_dims[0];
+    const int C = param.data_layout == DATALAYOUT(kNCHW)
+                      ? x_dims[1]
+                      : x_dims[x_dims.size() - 1];
+    const int sample_size = x->dims().production() / N / C;
+    // alloc memory
+    param.y->template mutable_data<T>();
+    param.mean_out->template mutable_data<T>();
+    param.variance_out->template mutable_data<T>();
+    param.saved_mean->template mutable_data<T>();
+    param.saved_variance->template mutable_data<T>();
+    if (!global_stats) {
+      // saved_xx is used only for this batch of data
+      EigenVectorArrayMap<T> saved_mean_e(param.saved_mean->mutable_data<T>(),
+                                          C);
+      EigenVectorArrayMap<T> saved_variance_e(
+          param.saved_variance->mutable_data<T>(), C);
+      saved_mean_e.setZero();
+      saved_variance_e.setZero();
+      EigenVectorArrayMap<T> running_mean_arr(param.mean_out->mutable_data<T>(),
+                                              C);
+      EigenVectorArrayMap<T> running_var_arr(
+          param.variance_out->mutable_data<T>(), C);
+      if ((N * sample_size) == 1) {
+        LOG(WARNING) << "Only 1 element in normalization dimension, "
+                     << "we skip the batch norm calculation, let y = x.";
+        framework::TensorCopy(x->raw_tensor(), platform::CPUPlace(),
+                              &param.y->raw_tensor());
+        return;
+      }
+      switch (param.data_layout) {
+        case DATALAYOUT(kNCHW): {
+          ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_mean_e(nc % C) += x_arr.col(nc).sum();
+          }
+          saved_mean_e /= N * sample_size;
+          for (int nc = 0; nc < N * C; ++nc) {
+            saved_variance_e(nc % C) +=
+                (x_arr.col(nc) - saved_mean_e(nc % C)).matrix().squaredNorm();
+          }
+          saved_variance_e /= N * sample_size;
+          break;
+        }
+        default:
+          LOG(FATAL) << "Unknown storage order: "
+                     << DataLayoutToStr(param.data_layout);
+          break;
+      }
+      running_mean_arr = running_mean_arr * param.momentum +
+                         saved_mean_e * (1. - param.momentum);
+      running_var_arr = running_var_arr * param.momentum +
+                        saved_variance_e * (1. - param.momentum);
+    }
+    // use SavedMean and SavedVariance to do normalize
+    Eigen::Array<T, Eigen::Dynamic, 1> inv_std(C);
+    if (global_stats) {
+      ConstEigenVectorArrayMap<T> var_arr(param.variance->data<T>(), C);
+      inv_std = (var_arr + param.epsilon).sqrt().inverse();
+    } else {
+      EigenVectorArrayMap<T> saved_inv_std(
+          param.saved_variance->mutable_data<T>(), C);
+      // inverse SavedVariance first, gradient will use it too.
+      saved_inv_std = (saved_inv_std + param.epsilon).inverse().sqrt();
+      inv_std = saved_inv_std;
+    }
+    ConstEigenVectorArrayMap<T> mean_arr(
+        global_stats ? param.mean->data<T>() : param.saved_mean->data<T>(), C);
+    //   (x - est_mean) * inv_var * scale + bias
+    //   formula transform ====>
+    //   (x * inv_var * scale) + (bias - est_mean * inv_var * scale)
+    ConstEigenVectorArrayMap<T> scale_arr(param.scale->data<T>(), C);
+    ConstEigenVectorArrayMap<T> bias_arr(param.bias->data<T>(), C);
+    Eigen::Array<T, Eigen::Dynamic, 1> new_scale = inv_std * scale_arr;
+    Eigen::Array<T, Eigen::Dynamic, 1> new_bias =
+        bias_arr - mean_arr * inv_std * scale_arr;
+    switch (param.data_layout) {
+      case DATALAYOUT(kNCHW): {
+        EigenArrayMap<T> y_arr(param.y->mutable_data<T>(), sample_size, N * C);
+        ConstEigenArrayMap<T> x_arr(x->data<T>(), sample_size, N * C);
+        for (int nc = 0; nc < N * C; ++nc) {
+          y_arr.col(nc) = x_arr.col(nc) * new_scale(nc % C) + new_bias(nc % C);
+        }
+        break;
+      }
+      default:
+        LOG(FATAL) << "Unknown storage order: "
+                   << DataLayoutToStr(param.data_layout);
+        break;
+    }
+  }
+  virtual ~BatchNormCompute() = default;
+};
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
--- a/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc
+++ b/paddle/fluid/lite/kernels/x86/batch_norm_compute_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/lite/kernels/x86/batch_norm_compute.h"
+#include <gtest/gtest.h>
+#include <iostream>
+#include <vector>
+#include "paddle/fluid/lite/core/op_registry.h"
+namespace paddle {
+namespace lite {
+namespace kernels {
+namespace x86 {
+// Verifies that the kernel registry yields at least one non-null kernel when
+// asked for "batch_norm" on the x86 target with float precision.
+TEST(batch_norm_x86, retrive_op) {
+  auto kernels =
+      KernelRegistry::Global().Create<TARGET(kX86), PRECISION(kFloat)>(
+          "batch_norm");
+  // The lookup must return something ...
+  ASSERT_FALSE(kernels.empty());
+  // ... and the first entry must be a usable (non-null) kernel.
+  ASSERT_TRUE(kernels.front());
+}
+// Verifies that a directly constructed float kernel reports the expected
+// precision and target.
+TEST(batch_norm_x86, init) {
+  BatchNormCompute<float> kernel;
+  ASSERT_EQ(kernel.precision(), PRECISION(kFloat));
+  ASSERT_EQ(kernel.target(), TARGET(kX86));
+}
+// Smoke test: runs the x86 batch_norm kernel once on a 2x3x64x64 input with
+// is_test == false and use_global_stats == false, then logs every output
+// tensor. No numerical expectations are asserted here; the test only checks
+// that the kernel runs to completion on deterministic input.
+TEST(batch_norm_x86, run_test) {
+  lite::Tensor x, scale, bias, mean, variance, y, mean_out, variance_out,
+      saved_mean, saved_variance;
+  constexpr int batch_size = 2;
+  constexpr int channels = 3;
+
+  // x and y share the same 4-D shape; every other tensor holds one value per
+  // channel.
+  std::vector<int64_t> feature_shape{batch_size, channels, 64, 64};
+  std::vector<int64_t> channel_shape{channels};
+  x.Resize(lite::DDim(feature_shape));
+  y.Resize(lite::DDim(feature_shape));
+  for (lite::Tensor* per_channel :
+       {&scale, &bias, &mean, &variance, &mean_out, &variance_out,
+        &saved_mean, &saved_variance}) {
+    per_channel->Resize(lite::DDim(channel_shape));
+  }
+
+  // Inputs are filled below; outputs only need their buffers allocated.
+  auto x_data = x.mutable_data<float>();
+  auto scale_data = scale.mutable_data<float>();
+  auto bias_data = bias.mutable_data<float>();
+  auto mean_data = mean.mutable_data<float>();
+  auto variance_data = variance.mutable_data<float>();
+  y.mutable_data<float>();
+  mean_out.mutable_data<float>();
+  variance_out.mutable_data<float>();
+  saved_mean.mutable_data<float>();
+  saved_variance.mutable_data<float>();
+
+  // Deterministic input values so that logged results are reproducible.
+  for (int64_t i = 0; i < x.dims().production(); i++) {
+    x_data[i] = static_cast<float>(i);
+  }
+  for (int64_t i = 0; i < scale.dims().production(); i++) {
+    scale_data[i] = static_cast<float>(i) * 0.01f + 0.03f;
+  }
+  for (int64_t i = 0; i < bias.dims().production(); i++) {
+    bias_data[i] = static_cast<float>(i) * 0.065f + 0.1f;
+  }
+  for (int64_t i = 0; i < mean.dims().production(); i++) {
+    mean_data[i] = static_cast<float>(i) * 0.0565f;
+  }
+  for (int64_t i = 0; i < variance.dims().production(); i++) {
+    variance_data[i] = static_cast<float>(i) * 2.08f + 1.5f;
+  }
+
+  BatchNormCompute<float> batch_norm;
+  operators::BatchNormParam param;
+  param.x = &x;
+  param.is_test = false;
+  param.scale = &scale;
+  param.bias = &bias;
+  param.mean = &mean;
+  param.variance = &variance;
+  param.use_global_stats = false;
+  param.epsilon = 1e-4f;
+  param.momentum = 0.9f;
+  param.y = &y;
+  param.mean_out = &mean_out;
+  param.variance_out = &variance_out;
+  param.saved_mean = &saved_mean;
+  param.saved_variance = &saved_variance;
+  batch_norm.SetParam(param);
+  batch_norm.Run();
+
+  LOG(INFO) << "output: " << y;
+  LOG(INFO) << "mean_out: " << mean_out;
+  // Log the variance tensor itself; mean_out was previously printed here.
+  LOG(INFO) << "variance_out: " << variance_out;
+  LOG(INFO) << "saved_mean: " << saved_mean;
+  // NOTE(review): on the non-global-stats path the kernel appears to
+  // overwrite saved_variance with 1 / sqrt(var + epsilon), so this value is
+  // presumably an inverse standard deviation - confirm against the kernel.
+  LOG(INFO) << "saved_variance: " << saved_variance;
+}
+}  // namespace x86
+}  // namespace kernels
+}  // namespace lite
+}  // namespace paddle
+USE_LITE_KERNEL(batch_norm, kX86, kFloat, kNCHW, def);
--- a/paddle/fluid/lite/tools/build.sh
+++ b/paddle/fluid/lite/tools/build.sh
@@ -135,8 +135,8 @@ function test_arm_model {
    adb -s emulator-${port} push ${model_dir} ${adb_work_dir}
    adb -s emulator-${port} push ${testpath} ${adb_work_dir}
    adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
-    local adb_model_path="./${adb_work_dir}/`basename ${model_dir}`"
+    local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
-    adb -s emulator-${port} shell "./${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
+    adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
 }
@@ -225,16 +225,11 @@ function test_arm {
    for _test in $(cat $TESTS_FILE); do
        test_arm_android $_test $port
    done
-    # TODO(sangoly): refine this
-    test_arm_model "test_cxx_api_lite" $port "./third_party/install/mobilenet_v2_relu"
 }
-# Build the code and run lite arm tests. This is executed in the CI system.
+function prepare_emulator {
-function build_test_arm {
+    local port_armv8=$1
-    ########################################################################
+    local port_armv7=$2
-    # job 1-4 must be in one runner
-    port_armv8=5554
-    port_armv7=5556
    adb kill-server
    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
@@ -245,6 +240,18 @@ function build_test_arm {
    echo n | avdmanager create avd -f -n paddle-armv7 -k "system-images;android-24;google_apis;armeabi-v7a"
    echo -ne '\n' | ${ANDROID_HOME}/emulator/emulator -avd paddle-armv7 -noaudio -no-window -gpu off -verbose -port ${port_armv7} &
    sleep 1m
+}
+# We split the arm unit tests into several sub-tasks that can run in parallel to reduce the overall CI time.
+# sub-task1
+function build_test_arm_subtask_android {
+    ########################################################################
+    # job 1-4 must be in one runner
+    port_armv8=5554
+    port_armv7=5556
+    prepare_emulator $port_armv8 $port_armv7
    # job 1
    build_arm "android" "armv8" "gcc"
@@ -252,9 +259,9 @@ function build_test_arm {
    cd -
    # job 2
-    build_arm "android" "armv8" "clang"
+    #build_arm "android" "armv8" "clang"
-    test_arm "android" "armv8" "clang" ${port_armv8}
+    #test_arm "android" "armv8" "clang" ${port_armv8}
-    cd -
+    #cd -
    # job 3
    build_arm "android" "armv7" "gcc"
@@ -262,13 +269,22 @@ function build_test_arm {
    cd -
    # job 4
-    build_arm "android" "armv7" "clang"
+    #build_arm "android" "armv7" "clang"
-    test_arm "android" "armv7" "clang" ${port_armv7}
+    #test_arm "android" "armv7" "clang" ${port_armv7}
-    cd -
+    #cd -
    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    echo "Done"
+}
+# sub-task2
+function build_test_arm_subtask_armlinux {
    ########################################################################
+    # job 1-4 must be in one runner
+    port_armv8=5554
+    port_armv7=5556
+    prepare_emulator $port_armv8 $port_armv7
    # job 5
    build_arm "armlinux" "armv8"
@@ -285,9 +301,47 @@ function build_test_arm {
    test_arm "armlinux" "armv7hf"
    cd -
+    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
+    echo "Done"
+}
+# sub-task3
+function build_test_arm_subtask3_mobilenet_v2 {
+    # Builds the test_cxx_api_lite target for a single android/armv8/gcc
+    # configuration, starts the emulators via prepare_emulator, runs the
+    # mobilenet_v2_relu model test on the armv8 emulator port, and finally
+    # kills every running emulator.
+    # Takes no arguments.
+    local port_armv8=5554
+    local port_armv7=5556
+    # We just test following single one environment to limit the CI time.
+    local os=android
+    local abi=armv8
+    local lang=gcc
+    # NOTE(review): cur_dir and build_dir are not declared local, so they stay
+    # set in the caller's scope after this function returns - confirm whether
+    # other functions rely on that.
+    cur_dir=$(pwd)
+    build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
+    mkdir -p $build_dir
+    # The remaining steps run from inside the build directory; the relative
+    # model path passed to test_arm_model below is resolved from here.
+    cd $build_dir
+    cmake_arm $os $abi $lang
+    # Only the single test binary is built, not the whole project.
+    make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE
+    # The emulators are started only after the build has finished.
+    prepare_emulator $port_armv8 $port_armv7
+    # just test the model on armv8
+    test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"
+    # Shut down every emulator listed by adb.
+    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    echo "Done"
 }
+# Build the code and run lite arm tests. This is executed in the CI system.
+function build_test_arm {
+    ########################################################################
+    # job 1-4 must be in one runner
+    port_armv8=5554
+    port_armv7=5556
+    build_test_arm_subtask_android
+    build_test_arm_subtask_armlinux
+}
 ############################# MAIN #################################
 function print_usage {
    echo -e "\nUSAGE:"
@@ -379,6 +433,18 @@ function main {
                build_test_arm
                shift
                ;;
+            build_test_arm_subtask_android)
+                build_test_arm_subtask_android
+                shift
+                ;;
+            build_test_arm_subtask_armlinux)
+                build_test_arm_subtask_armlinux
+                shift
+                ;;
+            build_test_arm_model1)
+                build_test_arm_subtask3_mobilenet_v2
+                shift
+                ;;
            check_style)
                check_style
                shift
@@ -397,4 +463,3 @@ function main {
 }
 main $@