make all ARMContext share the same DeviceInfo, and export SetRunMode to allow...

make all ARMContext share the same DeviceInfo, and export SetRunMode to allow users to set the thread num

make all ARMContext share the same DeviceInfo, and export SetRunMode to allow...
make all ARMContext share the same DeviceInfo, and export SetRunMode to allow users to set the thread num
ad333ac5 · Hong Ming · 6f705068 · 34491d6a · ad333ac5 · ad333ac5
9 changed files
--- a/paddle/fluid/lite/api/cxx_api_bin.cc
+++ b/paddle/fluid/lite/api/cxx_api_bin.cc
@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
  return counter.count() / 1000.0;
 }

-void Run(const char* model_dir, int repeat) {
+void Run(const char* model_dir, int repeat, int thread_num) {
 #ifdef LITE_WITH_ARM
  DeviceInfo::Init();
+  DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
 #endif
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
 }  // namespace paddle

 int main(int argc, char** argv) {
-  CHECK_EQ(argc, 3) << "usage: ./cmd <model_dir> <repeat>";
-  paddle::lite::Run(argv[1], std::stoi(argv[2]));
+  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
+  paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));

  return 0;
 }

--- a/paddle/fluid/lite/core/context.cc
+++ b/paddle/fluid/lite/core/context.cc
@@ -13,322 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/lite/core/context.h"
-#include "paddle/fluid/lite/core/cpu_info.h"
-
-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-
-#ifdef ARM_WITH_OMP
-#include <omp.h>
-#endif

 namespace paddle {
-namespace lite {
-
-#ifdef LITE_WITH_ARM
-
-void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int cpu_count = arm_get_cpucount();
-  dev.L1_cache_.resize(cpu_count);
-  dev.L2_cache_.resize(cpu_count);
-  dev.L3_cache_.resize(cpu_count);
-  for (int i = 0; i < cpu_count; ++i) {
-    dev.L1_cache_[i] = l1size;
-    dev.L2_cache_[i] = l2size;
-    dev.L3_cache_[i] = l3size;
-  }
-  workspace_.Resize({2 * (l1size + l2size)});
-}
-
-Context<TargetType::kARM>::Context() {
-  active_ids_ = {0};
-  mode_ = LITE_POWER_HIGH;
-  DeviceInfo& dev = DeviceInfo::Global();
-  workspace_.Resize(
-      {static_cast<int64_t>(dev.L2_cache_[active_ids_[0]] / sizeof(float))});
-#ifdef TARGET_IOS
-  arch_ = APPLE;  // use 6x8
-#else
-  if (dev.big_core_ids_.size() > 0) {
-    arch_ = dev.archs_[dev.big_core_ids_[0]];
-  }
-#endif
-}
-
-PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
-
-int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
-
-Context<TargetType::kARM>::Context(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-}
-
-ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
-  mode_ = ctx.mode_;
-  active_ids_ = ctx.active_ids_;
-  workspace_ = ctx.workspace_;
-  arch_ = ctx.arch_;
-  count_ = ctx.count_;
-  return *this;
-}
-
-void Context<TargetType::kARM>::BindDev() {
-#ifdef ARM_WITH_OMP
-  int num_threads = active_ids_.size();
-  omp_set_num_threads(num_threads);
-#ifdef LITE_WITH_LINUX
-  std::vector<int> ssarets;
-  for (int j = 0; j < num_threads; ++j) {
-    ssarets.push_back(0);
-  }
-#pragma omp parallel for
-  for (int i = 0; i < num_threads; i++) {
-    ssarets[i] = set_sched_affinity(active_ids_);
-  }
-  for (int i = 0; i < num_threads; i++) {
-    if (ssarets[i] != 0) {
-      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
-      return;
-    }
-  }
-#endif  // LITE_WITH_LINUX
-#else   // ARM_WITH_OMP
-#ifdef LITE_WITH_LINUX
-  std::vector<int> cpuid1;
-  cpuid1.push_back(active_ids_[0]);
-  int ssaret = set_sched_affinity(cpuid1);
-  if (ssaret != 0) {
-    printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
-    return;
-  }
-#endif  // LITE_WITH_LINUX
-#endif  // ARM_WITH_OMP
-}
-
-void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
-  DeviceInfo& dev = DeviceInfo::Global();
-  int big_core_size = dev.big_core_ids_.size();
-  int small_core_size = dev.little_core_ids_.size();
-  if (threads > big_core_size + small_core_size) {
-    threads = big_core_size + small_core_size;
-  }
-#ifdef ARM_WITH_OMP
-  count_++;
-  int shift_num = (count_ / 10) % big_core_size;
-  switch (mode) {
-    case LITE_POWER_FULL:
-      mode_ = mode;
-      active_ids_.clear();
-      for (int i = 0; i < threads; ++i) {
-        if (i < big_core_size) {
-          active_ids_.push_back(dev.big_core_ids_[i]);
-        } else {
-          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_HIGH;
-        if (threads > big_core_size) {
-          LOG(ERROR) << "threads: " << threads
-                     << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_NO_BIND:
-      mode_ = LITE_POWER_NO_BIND;
-      active_ids_.clear();
-      if (threads > dev.core_ids_.size()) {
-        active_ids_.resize(dev.core_ids_.size());
-      } else {
-        active_ids_.resize(threads);
-      }
-      break;
-    case LITE_POWER_RAND_HIGH:
-      active_ids_.clear();
-      if (big_core_size > 0) {
-        mode_ = LITE_POWER_RAND_HIGH;
-        if (threads > big_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the big cores size: " << big_core_size;
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                dev.big_core_ids_[(i + shift_num) % big_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_LOW;
-        LOG(WARNING)
-            << "HIGH POWER MODE is not support, switch to little cores";
-        if (threads > small_core_size) {
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.little_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-    case LITE_POWER_RAND_LOW:
-      active_ids_.clear();
-      if (small_core_size > 0) {
-        mode_ = LITE_POWER_RAND_LOW;
-        if (threads > small_core_size) {
-          LOG(WARNING) << "threads: " << threads
-                       << ", exceed the little cores size: " << small_core_size;
-          active_ids_ = dev.little_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(
-                dev.little_core_ids_[(i + shift_num) % small_core_size]);
-          }
-        }
-      } else {
-        mode_ = LITE_POWER_HIGH;
-        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
-        if (threads > big_core_size) {
-          active_ids_ = dev.big_core_ids_;
-        } else {
-          for (int i = 0; i < threads; ++i) {
-            active_ids_.push_back(dev.big_core_ids_[i]);
-          }
-        }
-      }
-      if (active_ids_.size() == 0) {
-        active_ids_.push_back(0);
-      }
-      break;
-  }
-  //! fix multi-threads LITE_POWER_HIGH mode
-  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
-    int threads = active_ids_.size();
-    omp_set_num_threads(threads);
-  } else {
-    if (check_online(active_ids_)) {
-      BindDev();
-    } else {
-      LOG(ERROR) << "core id " << active_ids_[0]
-                 << " is offline, switch to NO BIND MODE";
-      int threads = active_ids_.size();
-      omp_set_num_threads(threads);
-    }
-  }
-#else
-  if (big_core_size > 0) {
-    active_ids_ = {dev.big_core_ids_[0]};
-  } else {
-    active_ids_ = {0};
-  }
-#endif
-  //! alloc memory for sgemm in this context
-  int temp_mem_size =
-      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
-  workspace_.Resize({temp_mem_size});
-  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
-}
-
-ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
-
-void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
-
-int Context<TargetType::kARM>::l1_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L1_cache_[active_ids_[0]];
-}
-
-int Context<TargetType::kARM>::l2_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L2_cache_[active_ids_[0]];
-}
-
-int Context<TargetType::kARM>::l3_cache_size() const {
-  DeviceInfo& dev = DeviceInfo::Global();
-  return dev.L3_cache_[active_ids_[0]];
-}
-
-bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
-  auto count = dims.product();
-  auto old = workspace_.dims();
-  if (count == old.product()) {
-    return false;
-  }
-
-  workspace_.Resize(
-      {static_cast<int64_t>(count + l2_cache_size() / sizeof(float))});
-  return true;
-}
-#endif  // LITE_WITH_ARM
-
-}  // namespace lite
+namespace lite {}  // namespace lite
 }  // namespace paddle
--- a/paddle/fluid/lite/core/context.h
+++ b/paddle/fluid/lite/core/context.h
@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
 template <>
 class Context<TargetType::kARM> {
 public:
-  Context();
-  Context(PowerMode mode, int threads);
+  Context() {}
  explicit Context(const ARMContext& ctx);

-  ARMContext& operator=(const ARMContext& ctx);
+  ARMContext& operator=(const ARMContext& ctx) { return *this; }  // no per-context members to copy

  // NOTE: InitOnce should only be used by ContextScheduler
  void InitOnce() { DeviceInfo::Init(); }

  void CopyShared(const ARMContext* ctx) {}

-  void SetRunMode(PowerMode mode, int threads);
-  void SetCache(int l1size, int l2size, int l3size);
-  void SetArch(ARMArch arch);
-  void BindDev();
+  void SetRunMode(PowerMode mode, int threads) {
+    return DeviceInfo::Global().SetRunMode(mode, threads);
+  }
+  void SetCache(int l1size, int l2size, int l3size) {
+    return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
+  }
+  void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }

-  PowerMode mode() const;
-  int threads() const;
-  ARMArch arch() const;
+  PowerMode mode() const { return DeviceInfo::Global().mode(); }
+  int threads() const { return DeviceInfo::Global().threads(); }
+  ARMArch arch() const { return DeviceInfo::Global().arch(); }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

  template <typename T>
  T* workspace_data() {
-    return workspace_.mutable_data<T>();
+    return DeviceInfo::Global().workspace_data<T>();
  }

-  int l1_cache_size() const;
-  int l2_cache_size() const;
-  int l3_cache_size() const;
-  bool ExtendWorkspace(DDimLite dims);
+  bool ExtendWorkspace(DDimLite dims) {
+    return DeviceInfo::Global().ExtendWorkspace(dims);
+  }

  std::string name() const { return "ARMContext"; }
-
- private:
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  ARMArch arch_;
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
 };
 #endif


--- a/paddle/fluid/lite/core/cpu_info.cc
+++ b/paddle/fluid/lite/core/cpu_info.cc
--- a/paddle/fluid/lite/core/cpu_info.h
+++ b/paddle/fluid/lite/core/cpu_info.h
@@ -14,24 +14,12 @@

 #pragma once

+#include <cstdarg>
 #include <string>
 #include <vector>
+#include "paddle/fluid/lite/core/lite_tensor.h"
 #include "paddle/fluid/lite/utils/cp_logging.h"

-#ifdef LITE_WITH_LINUX
-#include <sys/syscall.h>
-#include <unistd.h>
-#endif
-
-#if __APPLE__
-#include "TargetConditionals.h"
-#if TARGET_OS_IPHONE
-#include <mach/machine.h>
-#include <sys/sysctl.h>
-#include <sys/types.h>
-#endif  // TARGET_OS_IPHONE
-#endif  // __APPLE__
-
 namespace paddle {
 namespace lite {

@@ -60,64 +48,73 @@ typedef enum {

 class DeviceInfo {
 public:
-  int idx_;
-  int max_freq_;
-  int min_freq_;
-  int generate_arch_;
-  int compute_core_num_;
-  int max_memory_;
-  int sharemem_size_;
-
-  std::string device_name_;
-  std::string compute_ability_;
-
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-
  static DeviceInfo& Global() {
    static auto* x = new DeviceInfo;
    return *x;
  }

-  static void Init() {
-    auto& info = Global();
-    InitInternal(&info);
+  static int Init() {
+    static int ret = Global().Setup();
+    return ret;
  }

- private:
-  DeviceInfo() = default;
-  static void InitInternal(DeviceInfo* dev);
-};
+  int Setup();

-size_t arm_get_meminfo();
+  void SetRunMode(PowerMode mode, int thread_num);
+  void SetCache(int l1size, int l2size, int l3size);
+  void SetArch(ARMArch arch) { arch_ = arch; }

-int arm_get_cpucount();
+  PowerMode mode() const { return mode_; }
+  int threads() const { return active_ids_.size(); }
+  ARMArch arch() const { return arch_; }
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }

-void arm_get_cpu_arch(std::vector<ARMArch>* archs);
-
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
-
-#ifdef LITE_WITH_LINUX
-
-void set_default_cache(DeviceInfo* dev);
+  template <typename T>
+  T* workspace_data() {
+    return workspace_.mutable_data<T>();
+  }
+  bool ExtendWorkspace(DDimLite dims);

-std::string arm_get_cpu_name();
+ private:
+  int core_num_;
+  std::vector<int> max_freqs_;
+  std::vector<int> min_freqs_;
+  int mem_size_;
+  std::string dev_name_;

-int get_max_freq_khz(int cpuid);
+  std::vector<int> L1_cache_;
+  std::vector<int> L2_cache_;
+  std::vector<int> L3_cache_;
+  std::vector<int> core_ids_;
+  std::vector<int> big_core_ids_;
+  std::vector<int> little_core_ids_;
+  std::vector<int> cluster_ids_;
+  std::vector<ARMArch> archs_;

-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids);
-int check_online(const std::vector<int>& core_ids);
-int set_sched_affinity(const std::vector<int>& cpuids);
+  ARMArch arch_;
+  // LITE_POWER_HIGH stands for using big cores,
+  // LITE_POWER_LOW stands for using small core,
+  // LITE_POWER_FULL stands for using all cores
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
+
+  void SetCacheInfo(int cache_id, int argc, ...);
+  void SetArchInfo(int argc, ...);
+  bool SetCPUInfoByName();
+  void SetCPUInfoByProb();
+  void RequestPowerFullMode(const int thread_num);
+  void RequestPowerHighMode(const int thread_num);
+  void RequestPowerLowMode(const int thread_num);
+  void RequestPowerNoBindMode(const int thread_num);
+  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
+  void RequestPowerRandLowMode(const int shift_num, const int thread_num);

-#endif  // LITE_WITH_LINUX
+  DeviceInfo() = default;
+};

 #endif  // LITE_WITH_ARM


--- a/paddle/fluid/lite/kernels/arm/conv_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute.cc
@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
  auto o_dims = param.output->dims();

  auto& ctx = this->ctx_->template As<ARMContext>();
-  // TODO(xxx): make api and expose it
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);

  int win = x_dims[3];  // nchw
  int hin = x_dims[2];

--- a/paddle/fluid/lite/kernels/arm/fc_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/fc_compute.cc
@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
  auto w_dims = param.w->dims();

  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);

  CHECK_GE(x_dims.size(), 2UL);
  CHECK_EQ(w_dims.size(), 2UL);

--- a/paddle/fluid/lite/kernels/arm/mul_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/mul_compute.cc
@@ -24,7 +24,6 @@ namespace arm {

 void MulCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }

 void MulCompute::Run() {

--- a/paddle/fluid/lite/kernels/arm/pool_compute.cc
+++ b/paddle/fluid/lite/kernels/arm/pool_compute.cc
@@ -26,7 +26,6 @@ namespace arm {

 void PoolCompute::PrepareForRun() {
  auto& ctx = this->ctx_->template As<ARMContext>();
-  ctx.SetRunMode(LITE_POWER_HIGH, 4);
 }

 void PoolCompute::Run() {