Commit ce46ef22 authored by H hong19860320

ARM cpu_info refine

test=develop
Parent 251255bc
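For reference while reading the diff: after this refactoring all ARM CPU detection lives behind the DeviceInfo singleton, and ARMContext::InitOnce() triggers it through DeviceInfo::Init(). A minimal usage sketch follows, assuming the paddle::lite namespace and the headers touched by this commit; the wrapper function ConfigureArmRuntime() is hypothetical and only illustrates the call order, it is not part of the commit.

#include "paddle/fluid/lite/core/cpu_info.h"

void ConfigureArmRuntime() {
  using namespace paddle::lite;  // illustrative; names as they appear in this diff
  // One-time detection of core count, memory size, per-core arch,
  // max/min frequency and cache sizes (Setup() runs once inside Init()).
  DeviceInfo::Init();
  auto& dev = DeviceInfo::Global();
  // Bind two worker threads to the big cluster; the implementation falls
  // back to the little cluster (and logs) when no big cores are available.
  dev.SetRunMode(LITE_POWER_HIGH, 2);
  // Cache sizes of the first active core, typically used for blocking and
  // workspace sizing.
  int l1 = dev.l1_cache_size();
  int l2 = dev.l2_cache_size();
  (void)l1;
  (void)l2;
}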
...
@@ -67,7 +67,7 @@ class Context<TargetType::kARM> {
   ARMContext& operator=(const ARMContext& ctx) {}

   // NOTE: InitOnce should only be used by ContextScheduler
-  void InitOnce() {}
+  void InitOnce() { DeviceInfo::Init(); }

   void CopyShared(const ARMContext* ctx) {}
...
@@ -78,20 +78,19 @@ class Context<TargetType::kARM> {
     return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
   }
   void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
-  void BindDev() { return DeviceInfo::Global().BindDev(); }

   PowerMode mode() const { return DeviceInfo::Global().mode(); }
   int threads() const { return DeviceInfo::Global().threads(); }
   ARMArch arch() const { return DeviceInfo::Global().arch(); }
-  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
-  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
-  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

   template <typename T>
   T* workspace_data() {
     return DeviceInfo::Global().workspace_data<T>();
   }
+  int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
+  int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
+  int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }

   bool ExtendWorkspace(DDimLite dims) {
     return DeviceInfo::Global().ExtendWorkspace(dims);
   }
......
...
@@ -29,7 +29,8 @@
 #ifdef ARM_WITH_OMP
 #include <omp.h>
 #endif
-#include <cstdarg>
+#include <algorithm>
+#include <limits>
 #include "paddle/fluid/lite/core/cpu_info.h"

 namespace paddle {
...
@@ -37,853 +38,920 @@ namespace lite {
 #ifdef LITE_WITH_ARM
-void DeviceInfo::InitInternal(DeviceInfo* dev) {
-  set_default_cache(dev);
-  dev->compute_core_num_ = arm_get_cpucount();
-  dev->max_memory_ = arm_get_meminfo();
+#ifdef TARGET_IOS
+const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
+#else
+const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
+const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
+const int DEFAULT_L3_CACHE_SIZE = 0;
+#endif
// get max freq int get_cpu_num() {
#ifdef LITE_WITH_LINUX #ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_); // get cpu count from /sys/devices/system/cpu/cpunum/uevent
for (int i = 0; i < dev->compute_core_num_; ++i) { int max_cpu_num = 20;
max_freq[i] = get_max_freq_khz(i) / 1000; int cpu_num = 0;
} for (int i = 0; i < max_cpu_num; ++i) {
std::string cpu_name = arm_get_cpu_name(); char path[256];
if (get_cpu_info_from_name(dev, cpu_name) != true) { snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_, FILE* fp = fopen(path, "rb");
max_freq, &dev->cluster_ids_); if (!fp) {
dev->big_core_ids_.clear(); break;
dev->little_core_ids_.clear(); }
for (int i = 0; i < dev->cluster_ids_.size(); ++i) { cpu_num++;
if (dev->cluster_ids_[i] == 0) { fclose(fp);
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
} }
if (cpu_num < 1) {
cpu_num = 1;
} }
arm_get_cpu_arch(&dev->archs_); return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
} }
return cpu_num;
#else
return 1;
#endif
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_; size_t get_mem_size() {
for (int i = 0; i < dev->compute_core_num_; ++i) { #ifdef LITE_WITH_LINUX
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i] // get cpu count from /proc/cpuinfo
<< ", frequence: " << max_freq[i] FILE* fp = fopen("/proc/meminfo", "rb");
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] if (!fp) {
<< ", CPU ARCH: A" << dev->archs_[i]; return 1;
} }
VLOG(1) << "L1 DataCache size is: "; size_t memsize = 0;
for (int i = 0; i < dev->compute_core_num_; ++i) { char line[1024];
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB"; while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
} }
VLOG(1) << "L2 Cache size is: "; sscanf(s, "MemTotal: %d kB", &memsize);
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
} }
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB"; fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
#endif
return 0;
}
dev->max_freq_ = max_freq[0]; void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
for (int j = 1; j < dev->compute_core_num_; ++j) { archs->clear();
if (dev->max_freq_ < max_freq[j]) { #ifdef LITE_WITH_LINUX
dev->max_freq_ = max_freq[j]; //! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return;
}
char line[1024];
while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
}
if (strstr(line, "part") != NULL) {
int arch_id = 0;
sscanf(s, "CPU part\t: %x", &arch_id);
switch (arch_id) {
case 0xd03:
archs->push_back(kA53);
break;
case 0xd05:
archs->push_back(kA55);
break;
case 0xd07:
archs->push_back(kA57);
break;
case 0xd08:
archs->push_back(kA72);
break;
case 0xd09:
archs->push_back(kA73);
break;
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
break;
case 0x205:
// 820
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
} }
} }
#elif defined(TARGET_IOS) #elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_); for (int i = 0; i < cpu_num; ++i) {
#endif archs->push_back(APPLE);
dev->active_ids_ = {0}; }
dev->mode_ = LITE_POWER_HIGH;
dev->workspace_.Resize({static_cast<int64_t>(
dev->L2_cache_[dev->active_ids_[0]] / sizeof(float))});
#ifdef TARGET_IOS
dev->arch_ = APPLE; // use 6x8
#else #else
if (dev->big_core_ids_.size() > 0) { for (int i = 0; i < cpu_num; ++i) {
dev->arch_ = dev->archs_[dev->big_core_ids_[0]]; archs->push_back(kARMArch_UNKOWN);
} }
#endif #endif
} }
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
int cpu_count = arm_get_cpucount();
L1_cache_.resize(cpu_count);
L2_cache_.resize(cpu_count);
L3_cache_.resize(cpu_count);
for (int i = 0; i < cpu_count; ++i) {
L1_cache_[i] = l1size;
L2_cache_[i] = l2size;
L3_cache_[i] = l3size;
}
workspace_.Resize({2 * (l1size + l2size)});
}
void DeviceInfo::BindDev() {
#ifdef ARM_WITH_OMP
int num_threads = active_ids_.size();
omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX #ifdef LITE_WITH_LINUX
std::vector<int> ssarets;
for (int j = 0; j < num_threads; ++j) { std::string get_cpu_name() {
ssarets.push_back(0); FILE* fp = fopen("/proc/cpuinfo", "rb");
} if (!fp) {
#pragma omp parallel for return "";
for (int i = 0; i < num_threads; i++) {
ssarets[i] = set_sched_affinity(active_ids_);
} }
for (int i = 0; i < num_threads; i++) { char line[1024];
if (ssarets[i] != 0) { while (!feof(fp)) {
LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i]; char* s = fgets(line, 1024, fp);
return; if (!s) {
break;
} }
if (strstr(line, "Hardware") != NULL) {
fclose(fp);
return std::string(line);
} }
#endif // LITE_WITH_LINUX
#else // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
std::vector<int> cpuid1;
cpuid1.push_back(active_ids_[0]);
int ssaret = set_sched_affinity(cpuid1);
if (ssaret != 0) {
printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
return;
} }
#endif // LITE_WITH_LINUX fclose(fp);
#endif // ARM_WITH_OMP return "";
} }
void DeviceInfo::SetRunMode(PowerMode mode, int threads) { void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
int big_core_size = big_core_ids_.size(); *max_freq = 0;
int small_core_size = little_core_ids_.size(); *min_freq = 0;
if (threads > big_core_size + small_core_size) { // first try, for all possible cpu
threads = big_core_size + small_core_size; char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
} }
#ifdef ARM_WITH_OMP fscanf(fp, "%d", max_freq);
count_++; fclose(fp);
int shift_num = (count_ / 10) % big_core_size; // get min_freq
switch (mode) { snprintf(path, sizeof(path),
case LITE_POWER_FULL: "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
mode_ = mode; cpu_id);
active_ids_.clear(); fp = fopen(path, "rb");
for (int i = 0; i < threads; ++i) { if (!fp) {
if (i < big_core_size) { return;
active_ids_.push_back(big_core_ids_[i]);
} else {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
} }
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
} }
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
} }
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break; break;
case LITE_POWER_HIGH:
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (threads > big_core_size) {
LOG(ERROR) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
} }
if (freq > *max_freq) {
*max_freq = freq;
} }
} else { if (freq < *min_freq) {
mode_ = LITE_POWER_LOW; *min_freq = freq;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (threads > small_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
} }
fclose(fp);
}
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
} }
if (active_ids_.size() == 0) { cpu_ids->resize(cpu_num);
active_ids_.push_back(0); cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
} }
break; // sort cpuid as big core first
case LITE_POWER_LOW: // simple bubble sort
active_ids_.clear(); for (int i = 0; i < cpu_num; i++) {
if (small_core_size > 0) { for (int j = i + 1; j < cpu_num; j++) {
mode_ = LITE_POWER_LOW; if (max_freqs[i] < max_freqs[j]) {
if (threads > small_core_size) { // swap
LOG(WARNING) << "threads: " << threads int tmp = cpu_ids->at(i);
<< ", exceed the little cores size: " << small_core_size; cpu_ids->at(i) = cpu_ids->at(j);
active_ids_ = little_core_ids_; cpu_ids->at(j) = tmp;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
} }
}
// SMP
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else { } else {
mode_ = LITE_POWER_HIGH; cluster_ids->at(i) = 1;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
} }
} }
}
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
int* l3_cache_size) {
int max_cache_idx_num = 10;
*l1_cache_size = DEFAULT_L1_CACHE_SIZE;
*l2_cache_size = DEFAULT_L2_CACHE_SIZE;
*l3_cache_size = DEFAULT_L3_CACHE_SIZE;
for (int i = 0; i < max_cache_idx_num; i++) {
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i);
FILE* fp = fopen(path, "rb");
if (fp) {
int level = -1;
fscanf(fp, "%d", &level);
fclose(fp);
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i);
fp = fopen(path, "rb");
if (fp) {
int size = -1;
fscanf(fp, "%d", &size);
fclose(fp);
if (size >= 0) {
if (level == 1) {
*l1_cache_size = size * 1024;
} else if (level == 2) {
*l2_cache_size = size * 1024;
} else if (level == 3) {
*l3_cache_size = size * 1024;
} }
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
} }
break;
case LITE_POWER_NO_BIND:
mode_ = LITE_POWER_NO_BIND;
active_ids_.clear();
if (threads > core_ids_.size()) {
active_ids_.resize(core_ids_.size());
} else {
active_ids_.resize(threads);
} }
break;
case LITE_POWER_RAND_HIGH:
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
if (threads > big_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
big_core_ids_[(i + shift_num) % big_core_size]);
} }
} }
} else { }
mode_ = LITE_POWER_LOW;
LOG(WARNING) bool check_cpu_online(const std::vector<int>& cpu_ids) {
<< "HIGH POWER MODE is not support, switch to little cores."; if (cpu_ids.size() == 0) {
if (threads > small_core_size) { return false;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
char path[256];
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
} }
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
} }
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_RAND_LOW:
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % small_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (threads > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
} }
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
// {
// unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
// } cpu_set_t;
// set affinity for thread
#ifdef __GLIBC__
pid_t pid = syscall(SYS_gettid);
#else
pid_t pid = gettid();
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
} }
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
return -1;
} }
if (active_ids_.size() == 0) { return 0;
active_ids_.push_back(0); }
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
int thread_num = cpu_ids.size();
omp_set_num_threads(thread_num);
std::vector<int> ssarets;
for (int i = 0; i < thread_num; ++i) {
ssarets.push_back(0);
} }
break; #pragma omp parallel for
for (int i = 0; i < thread_num; i++) {
ssarets[i] = set_sched_affinity(cpu_ids);
} }
//! fix multi-threads LITE_POWER_HIGH mode for (int i = 0; i < thread_num; i++) {
if (mode_ == LITE_POWER_NO_BIND || threads > 1) { if (ssarets[i] != 0) {
int threads = active_ids_.size(); LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
omp_set_num_threads(threads); return false;
} else {
if (check_online(active_ids_)) {
BindDev();
} else {
LOG(WARNING) << "core id " << active_ids_[0]
<< " is offline, switch to NO BIND MODE";
int threads = active_ids_.size();
omp_set_num_threads(threads);
} }
} }
#else #else // ARM_WITH_OMP
if (big_core_size > 0) { std::vector<int> first_cpu_id;
active_ids_ = {big_core_ids_[0]}; first_cpu_id.push_back(cpu_ids[0]);
} else { int ssaret = set_sched_affinity(first_cpu_id);
active_ids_ = {0}; if (ssaret != 0) {
} LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
#endif
//! alloc memory for sgemm in this context
int temp_mem_size = L2_cache_[active_ids_[0]] / sizeof(float);
workspace_.Resize({temp_mem_size});
arch_ = archs_[active_ids_[0]];
}
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
auto count = dims.product();
auto old = workspace_.dims();
if (count == old.product()) {
return false; return false;
} }
#endif // ARM_WITH_OMP
workspace_.Resize({static_cast<int64_t>(
count + L2_cache_[active_ids_[0]] / sizeof(float))});
return true;
} }
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 // cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) { void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
va_list arg_ptr; va_list arg_ptr;
va_start(arg_ptr, argc); va_start(arg_ptr, argc);
std::vector<int>* cache; std::vector<int>* cache;
switch (cache_id) { switch (cache_id) {
case 0: case 0:
cache = &cpu_info->L1_cache_; cache = &L1_cache_;
break; break;
case 1: case 1:
cache = &cpu_info->L2_cache_; cache = &L2_cache_;
break; break;
case 2: case 2:
cache = &cpu_info->L3_cache_; cache = &L3_cache_;
break; break;
default: default:
break; break;
} }
int core_num = cpu_info->compute_core_num_; cache->resize(core_num_);
cache->resize(core_num);
if (argc == 1) { if (argc == 1) {
int cache_size = va_arg(arg_ptr, int); int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) { for (int i = 0; i < core_num_; ++i) {
(*cache)[i] = cache_size; (*cache)[i] = cache_size;
} }
} else { } else {
int big_core_num = cpu_info->big_core_ids_.size(); int big_core_num = big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size(); int little_core_num = little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int); int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int); int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) { for (int i = 0; i < big_core_num; ++i) {
(*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size; (*cache)[big_core_ids_[i]] = big_core_cache_size;
} }
for (int i = 0; i < little_core_num; ++i) { for (int i = 0; i < little_core_num; ++i) {
(*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size; (*cache)[little_core_ids_[i]] = little_core_cache_size;
} }
} }
va_end(arg_ptr); va_end(arg_ptr);
} }
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) { void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr; va_list arg_ptr;
va_start(arg_ptr, argc); va_start(arg_ptr, argc);
int core_num = cpu_info->compute_core_num_; archs_.resize(core_num_);
cpu_info->archs_.resize(core_num);
if (argc == 1) { if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int); ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) { for (int i = 0; i < core_num_; ++i) {
cpu_info->archs_[i] = arch; archs_[i] = arch;
} }
} else { } else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = cpu_info->big_core_ids_.size(); int big_core_num = big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size(); int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) { for (int i = 0; i < big_core_num; ++i) {
cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch; archs_[big_core_ids_[i]] = big_core_arch;
} }
for (int i = 0; i < little_core_num; ++i) { for (int i = 0; i < little_core_num; ++i) {
cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch; archs_[little_core_ids_[i]] = little_core_arch;
} }
} }
va_end(arg_ptr); va_end(arg_ptr);
} }
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) { bool DeviceInfo::SetCPUInfoByName() {
/* Snapdragon */ /* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845 if (dev_name_.find("SM8150") != std::string::npos) { // 855
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55); SetArchInfo(2, kA76, kA55);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
set_cache_info(cpu_info, 2, 1, 2048 * 1024); SetCacheInfo(2, 1, 2048 * 1024);
return true; return true;
} else if (dev_name_.find("SDM845") != std::string::npos) { // 845
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; SetArchInfo(2, kA75, kA55);
set_arch_info(cpu_info, 2, kA75, kA55); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
SetCacheInfo(2, 1, 2048 * 1024);
return true;
} else if (dev_name_.find("SDM710") != std::string::npos) { // 710
core_num_ = 8;
core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
big_core_ids_ = {6, 7};
little_core_ids_ = {0, 1, 2, 3, 4, 5};
cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
SetArchInfo(2, kA75, kA55);
SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
SetCacheInfo(2, 1, 1024 * 1024);
return true; return true;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53); SetArchInfo(2, kA73, kA53);
set_cache_info(cpu_info, 0, 2, 64 * 1024); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, SetCacheInfo(1, 2, 1024 * 1024,
/*real cache size is 2M, while that will get bad performace /*real cache size is 2M, while that will get bad performace
on conv3x3s1 or gemm, set to 1M or 512K*/ on conv3x3s1 or gemm, set to 1M or 512K*/
1024 * 1024); 1024 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8996") != std::string::npos) { // 820
} else if (hardware_name.find("MSM8996") != std::string::npos) { // 820 core_num_ = 4;
cpu_info->compute_core_num_ = 4; core_ids_ = {0, 1, 2, 3};
cpu_info->core_ids_ = {0, 1, 2, 3}; big_core_ids_ = {2, 3};
cpu_info->big_core_ids_ = {2, 3}; little_core_ids_ = {0, 1};
cpu_info->little_core_ids_ = {0, 1}; cluster_ids_ = {1, 1, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 0, 0}; SetArchInfo(1, kA72);
set_arch_info(cpu_info, 1, kA72); SetCacheInfo(0, 1, 24 * 1024);
set_cache_info(cpu_info, 0, 1, 24 * 1024); SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true; return true;
} else if (dev_name_.find("SDM660") != std::string::npos ||
} else if (hardware_name.find("SDM660") != std::string::npos || dev_name_.find("SDM636") != std::string::npos) { // 660, 636
hardware_name.find("SDM636") != std::string::npos) { // 660, 636 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; SetArchInfo(1, kA73);
set_arch_info(cpu_info, 1, kA73); SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); SetCacheInfo(1, 1, 1024 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; SetArchInfo(2, kA72, kA53);
set_arch_info(cpu_info, 2, kA72, kA53); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8953") != std::string::npos) { // 625
} else if (hardware_name.find("MSM8953") != std::string::npos) { // 625 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; little_core_ids_ = {};
cpu_info->little_core_ids_ = {}; cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; SetArchInfo(1, kA53);
set_arch_info(cpu_info, 1, kA53); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(1, 1, 1024 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true; return true;
} else if (dev_name_.find("MSM8939") != std::string::npos) { // 615
} else if (hardware_name.find("MSM8939") != std::string::npos) { // 615 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {4, 5, 6, 7}; cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; SetArchInfo(1, kA53);
set_arch_info(cpu_info, 1, kA53); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
return true; return true;
/* MediaTek */ /* MediaTek */
} else if (dev_name_.find("MT6797") !=
} else if (hardware_name.find("MT6797") !=
std::string::npos) { // X20/X23/X25/X27 std::string::npos) { // X20/X23/X25/X27
cpu_info->compute_core_num_ = 10; core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
cpu_info->big_core_ids_ = {8, 9}; big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53); SetArchInfo(2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024); SetCacheInfo(0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
return true; return true;
} else if (dev_name_.find("MT6799") != std::string::npos) { // X30
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30 core_num_ = 10;
cpu_info->compute_core_num_ = 10; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; big_core_ids_ = {8, 9};
cpu_info->big_core_ids_ = {8, 9}; little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; SetArchInfo(2, kA73, kA53);
set_arch_info(cpu_info, 2, kA73, kA53);
return true; return true;
} else if (dev_name_.find("MT6795") != std::string::npos ||
} else if (hardware_name.find("MT6795") != std::string::npos || dev_name_.find("MT6762") != std::string::npos ||
hardware_name.find("MT6762") != std::string::npos || dev_name_.find("MT6755T") != std::string::npos ||
hardware_name.find("MT6755T") != std::string::npos || dev_name_.find("MT6755S") != std::string::npos ||
hardware_name.find("MT6755S") != std::string::npos || dev_name_.find("MT6753") != std::string::npos ||
hardware_name.find("MT6753") != std::string::npos || dev_name_.find("MT6752") != std::string::npos ||
hardware_name.find("MT6752") != std::string::npos || dev_name_.find("MT6750") != std::string::npos) {
hardware_name.find("MT6750") != std::string::npos) {
// X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {}; little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53); SetArchInfo(1, kA53);
return true; return true;
} else if (dev_name_.find("MT6758") != std::string::npos ||
} else if (hardware_name.find("MT6758") != std::string::npos || dev_name_.find("MT6757") != std::string::npos ||
hardware_name.find("MT6757") != std::string::npos || dev_name_.find("MT6763") != std::string::npos ||
hardware_name.find("MT6763") != std::string::npos || dev_name_.find("MT6755M") != std::string::npos ||
hardware_name.find("MT6755M") != std::string::npos || dev_name_.find("MT6755") !=
hardware_name.find("MT6755") !=
std::string::npos) { // P30, P20/P25, P23, P10 std::string::npos) { // P30, P20/P25, P23, P10
cpu_info->compute_core_num_ = 8; core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53); SetArchInfo(1, kA53);
return true; return true;
} else if (dev_name_.find("MT6771") != std::string::npos) { // P60
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60 core_num_ = 8;
cpu_info->compute_core_num_ = 8; core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; big_core_ids_ = {4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7}; little_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {0, 1, 2, 3}; cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; SetArchInfo(2, kA73, kA53);
set_arch_info(cpu_info, 2, kA73, kA53);
return true; return true;
} else if (dev_name_.find("MT6765") != std::string::npos ||
} else if (hardware_name.find("MT6765") != std::string::npos || dev_name_.find("MT6739") != std::string::npos ||
hardware_name.find("MT6739") != std::string::npos || dev_name_.find("MT6738") != std::string::npos ||
hardware_name.find("MT6738") != std::string::npos || dev_name_.find("MT6737") !=
hardware_name.find("MT6737") !=
std::string::npos) { // A22, MT6739, MT6738, MT6767 std::string::npos) { // A22, MT6739, MT6738, MT6767
cpu_info->compute_core_num_ = 4; core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3}; core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 0, 0, 0}; big_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {}; little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0}; cluster_ids_ = {0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53); SetArchInfo(1, kA53);
return true; return true;
} }
return false; return false;
} }
size_t arm_get_meminfo() { void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX #ifdef LITE_WITH_LINUX
// get cpu count from /proc/cpuinfo // get big.LITTLE cores by sorting CPU frequency
FILE* fp = fopen("/proc/meminfo", "rb"); sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
if (!fp) { big_core_ids_.clear();
return 1; little_core_ids_.clear();
for (int i = 0; i < cluster_ids_.size(); ++i) {
if (cluster_ids_[i] == 0) {
big_core_ids_.push_back(core_ids_[i]);
} else {
little_core_ids_.push_back(core_ids_[i]);
} }
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
char* s = fgets(line, 1024, fp);
if (!s) {
break;
} }
sscanf(s, "MemTotal: %d kB", &memsize); // get l1, l2, l3 cache size for each core
for (int i = 0; i < core_num_; i++) {
get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
} }
#endif // LITE_WITH_LINUX
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
} }
int arm_get_cpucount() { void DeviceInfo::RequestPowerFullMode(const int thread_num) {
#ifdef LITE_WITH_LINUX int big_core_size = big_core_ids_.size();
// get cpu count from /sys/devices/system/cpu/cpunum/uevent int little_core_size = little_core_ids_.size();
int max_cpu_count = 20; active_ids_.clear();
int count = 0; for (int i = 0; i < thread_num; ++i) {
for (int i = 0; i < max_cpu_count; ++i) { if (i < big_core_size) {
char path[256]; active_ids_.push_back(big_core_ids_[i]);
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); } else if (i < big_core_size + little_core_size) {
FILE* fp = fopen(path, "rb"); active_ids_.push_back(little_core_ids_[i - big_core_size]);
if (!fp) {
break;
}
count++;
fclose(fp);
}
if (count < 1) {
count = 1;
} }
return count;
#elif defined(TARGET_IOS)
int count = 0;
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
if (count < 1) {
count = 1;
} }
return count; mode_ = LITE_POWER_FULL;
#else
return 1;
#endif
} }
void arm_get_cpu_arch(std::vector<ARMArch>* archs) { void DeviceInfo::RequestPowerHighMode(const int thread_num) {
#ifdef LITE_WITH_LINUX int big_core_size = big_core_ids_.size();
archs->clear(); int little_core_size = little_core_ids_.size();
//! get CPU ARCH active_ids_.clear();
FILE* fp = fopen("/proc/cpuinfo", "rb"); if (big_core_size > 0) {
if (!fp) { mode_ = LITE_POWER_HIGH;
return; if (thread_num > big_core_size) {
} LOG(ERROR) << "Request thread num: " << thread_num
char line[1024]; << ", exceed the big cores size: " << big_core_size
while (!feof(fp)) { << ", truncate thread num to " << big_core_size;
char* s = fgets(line, 1024, fp); active_ids_ = big_core_ids_;
if (!s) { } else {
break; for (int i = 0; i < thread_num; ++i) {
} active_ids_.push_back(big_core_ids_[i]);
if (strstr(line, "part") != NULL) {
int arch_id = 0;
sscanf(s, "CPU part\t: %x", &arch_id);
switch (arch_id) {
case 0xd03:
archs->push_back(kA53);
break;
case 0xd05:
archs->push_back(kA55);
break;
case 0xd07:
archs->push_back(kA57);
break;
case 0xd08:
archs->push_back(kA72);
break;
case 0xd09:
archs->push_back(kA73);
break;
case 0xd0a:
archs->push_back(kA75);
break;
case 0x800:
// 835
archs->push_back(kA73);
break;
case 0x205:
// 820
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
archs->push_back(kARMArch_UNKOWN);
}
} }
} }
fclose(fp); } else {
int cpu_count = arm_get_cpucount(); mode_ = LITE_POWER_LOW;
if (archs->size() < cpu_count) { LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
for (int i = archs->size(); i < cpu_count; ++i) { if (thread_num > little_core_size) {
archs->push_back(archs->at(i - 1)); active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
} }
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
archs->push_back(APPLE);
} }
#endif
} }
#ifdef LITE_WITH_LINUX void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
void set_default_cache(DeviceInfo* dev) { int little_core_size = little_core_ids_.size();
int cpu_count = arm_get_cpucount(); active_ids_.clear();
dev->L1_cache_.resize(cpu_count); if (little_core_size > 0) {
dev->L2_cache_.resize(cpu_count); mode_ = LITE_POWER_LOW;
dev->L3_cache_.resize(cpu_count); if (thread_num > little_core_size) {
#ifdef TARGET_IOS LOG(WARNING) << "Request thread num: " << thread_num
for (int i = 0; i < cpu_count; ++i) { << ", exceed the little cores size: " << little_core_size
dev->L1_cache_[i] = 64 * 1024; << ", truncate thread num to " << little_core_size;
dev->L2_cache_[i] = 2048 * 1024; active_ids_ = little_core_ids_;
dev->L3_cache_[i] = 0; } else {
} for (int i = 0; i < thread_num; i++) {
#else active_ids_.push_back(little_core_ids_[i]);
for (int i = 0; i < cpu_count; ++i) {
dev->L1_cache_[i] = 32 * 1024;
dev->L2_cache_[i] = 512 * 1024;
dev->L3_cache_[i] = 0;
} }
#endif
}
std::string arm_get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
} }
char line[1024]; } else {
while (!feof(fp)) { mode_ = LITE_POWER_HIGH;
char* s = fgets(line, 1024, fp); LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (!s) { if (thread_num > big_core_size) {
break; active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
} }
if (strstr(line, "Hardware") != NULL) {
fclose(fp);
return std::string(line);
} }
} }
fclose(fp);
return "";
} }
int get_max_freq_khz(int cpuid) { void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
// first try, for all possible cpu active_ids_.clear();
char path[256]; for (int i = 0; i < thread_num; i++) {
snprintf(path, sizeof(path), active_ids_.push_back(0);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
fp = fopen(path, "rb");
if (!fp) {
return -1;
} }
mode_ = LITE_POWER_NO_BIND;
}
int max_freq_khz = -1; void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
fscanf(fp, "%d", &max_freq_khz); const int thread_num) {
int big_core_size = big_core_ids_.size();
fclose(fp); int little_core_size = little_core_ids_.size();
if (big_core_size > 0) {
return max_freq_khz; mode_ = LITE_POWER_RAND_HIGH;
if (thread_num > big_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
} }
} }
} else {
int max_freq_khz = 0; mode_ = LITE_POWER_LOW;
while (!feof(fp)) { LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
int freq_khz = 0; if (thread_num > little_core_size) {
int nscan = fscanf(fp, "%d %*d", &freq_khz); active_ids_ = little_core_ids_;
if (nscan != 1) { } else {
break; for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
} }
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
} }
} }
fclose(fp);
return max_freq_khz;
} }
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids, void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const std::vector<int>& cpu_freq, const int thread_num) {
std::vector<int>* cluster_ids) { int big_core_size = big_core_ids_.size();
if (cpu_count == 0) { int little_core_size = little_core_ids_.size();
return 0; active_ids_.clear();
} if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
cpuids->resize(cpu_count); if (thread_num > little_core_size) {
cluster_ids->resize(cpu_count); LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
for (int i = 0; i < cpu_count; i++) { << ", truncate thread num to " << little_core_size;
cpuids->at(i) = i; active_ids_ = little_core_ids_;
} } else {
for (int i = 0; i < thread_num; ++i) {
// sort cpuid as big core first active_ids_.push_back(
// simple bubble sort little_core_ids_[(i + shift_num) % little_core_size]);
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
}
} }
} }
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
cluster_ids->at(i) = 0;
} else { } else {
cluster_ids->at(i) = 1; mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
} }
} }
return 0;
} }
int check_online(const std::vector<int>& core_ids) { int DeviceInfo::Setup() {
if (core_ids.size() == 0) { core_num_ = get_cpu_num();
return 0; mem_size_ = get_mem_size();
} get_cpu_arch(&archs_, core_num_);
char path[256]; // set defalut CPU info
int online = 1; SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE);
for (int i = 0; i < core_ids.size(); ++i) { SetCacheInfo(1, DEFAULT_L2_CACHE_SIZE);
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", SetCacheInfo(2, DEFAULT_L3_CACHE_SIZE);
core_ids[i]); #ifdef LITE_WITH_LINUX
FILE* fp = fopen(path, "rb"); // get max&min freq
if (!fp) { max_freqs_.resize(core_num_);
min_freqs_.resize(core_num_);
for (int i = 0; i < core_num_; ++i) {
int max_freq, min_freq;
get_cpu_max_min_freq(i, &max_freq, &min_freq);
max_freqs_[i] = max_freq / 1000;
min_freqs_[i] = min_freq / 1000;
}
// get cache size and big.LITTLE core ids
dev_name_ = get_cpu_name();
if (!SetCPUInfoByName()) {
SetCPUInfoByProb();
}
// output info
LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
LOG(INFO) << "ARM multiprocessors number: " << core_num_;
for (int i = 0; i < core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
<< ", max freq: " << max_freqs_[i]
<< ", min freq: " << min_freqs_[i]
<< ", cluster ID: " << cluster_ids_[core_ids_[i]]
<< ", CPU ARCH: A" << archs_[i];
}
LOG(INFO) << "L1 DataCache size is: ";
for (int i = 0; i < core_num_; ++i) {
LOG(INFO) << L1_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "L2 Cache size is: ";
for (int i = 0; i < core_num_; ++i) {
LOG(INFO) << L2_cache_[i] / 1024 << " KB";
}
LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
// set default run mode
SetRunMode(LITE_POWER_NO_BIND, 1); // use single thread by default
return 0; return 0;
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
} }
int set_sched_affinity(const std::vector<int>& cpuids) { void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
// #define CPU_SETSIZE 1024 #ifdef ARM_WITH_OMP
// #define __NCPUBITS (8 * sizeof (unsigned long)) thread_num = std::min(thread_num, core_num_);
// typedef struct
// {
// unsigned long __bits[CPU_SETSIZE / __NCPUBITS];
// } cpu_set_t;
// set affinity for thread
#ifdef __GLIBC__
pid_t pid = syscall(SYS_gettid);
#else #else
pid_t pid = gettid(); thread_num = 1; // force thread_num to 1 if OpenMP is disabled
#endif #endif
cpu_set_t mask; #ifdef LITE_WITH_LINUX
CPU_ZERO(&mask); int big_core_size = big_core_ids_.size();
for (int i = 0; i < cpuids.size(); i++) { int little_core_size = little_core_ids_.size();
CPU_SET(cpuids[i], &mask); int big_little_core_size = big_core_size + little_core_size;
thread_num = std::min(thread_num, big_little_core_size);
count_++;
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
RequestPowerFullMode(thread_num);
break;
case LITE_POWER_HIGH:
RequestPowerHighMode(thread_num);
break;
case LITE_POWER_LOW:
RequestPowerLowMode(thread_num);
break;
case LITE_POWER_NO_BIND:
RequestPowerNoBindMode(thread_num);
break;
case LITE_POWER_RAND_HIGH:
RequestPowerRandHighMode(shift_num, thread_num);
break;
case LITE_POWER_RAND_LOW:
RequestPowerRandLowMode(shift_num, thread_num);
break;
default:
LOG(FATAL) << "Unsupported power mode: " << mode;
break;
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
#ifdef ARM_WITH_OMP
omp_set_num_threads(active_ids_.size());
#endif
if (mode_ != LITE_POWER_NO_BIND) {
if (check_cpu_online(active_ids_)) {
bind_threads(active_ids_);
} else {
LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
mode_ = LITE_POWER_NO_BIND;
} }
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
} }
#else // LITE_WITH_LINUX
// only LITE_POWER_NO_BIND is supported in other OS
RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
omp_set_num_threads(active_ids_.size());
#endif
#endif // LITE_WITH_LINUX
//! alloc memory for sgemm in this context
workspace_.Resize(
{static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
arch_ = archs_[active_ids_[0]];
}
return 0; void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
SetCacheInfo(0, l1size);
SetCacheInfo(1, l2size);
SetCacheInfo(2, l3size);
workspace_.Resize({2 * (l1size + l2size)});
} }
#endif // LITE_WITH_LINUX bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
auto count = dims.product();
auto old = workspace_.dims();
if (count == old.product()) {
return false;
}
workspace_.Resize({static_cast<int64_t>(
count + L2_cache_[active_ids_[0]] / sizeof(float))});
return true;
}
#endif // LITE_WITH_ARM #endif // LITE_WITH_ARM
......
...
@@ -14,6 +14,7 @@
 #pragma once
+#include <cstdarg>
 #include <string>
 #include <vector>
 #include "paddle/fluid/lite/core/lite_tensor.h"
...
@@ -47,92 +48,73 @@ typedef enum {
 class DeviceInfo {
  public:
-  int idx_;
-  int max_freq_;
-  int min_freq_;
-  int generate_arch_;
-  int compute_core_num_;
-  int max_memory_;
-  int sharemem_size_;
-  std::string device_name_;
-  std::string compute_ability_;
-  std::vector<int> L1_cache_;
-  std::vector<int> L2_cache_;
-  std::vector<int> L3_cache_;
-  std::vector<int> core_ids_;
-  std::vector<int> big_core_ids_;
-  std::vector<int> little_core_ids_;
-  std::vector<int> cluster_ids_;
-  std::vector<ARMArch> archs_;
-  ARMArch arch_;
-  // LITE_POWER_HIGH stands for using big cores,
-  // LITE_POWER_LOW stands for using small core,
-  // LITE_POWER_FULL stands for using all cores
-  PowerMode mode_;
-  std::vector<int> active_ids_;
-  TensorLite workspace_;
-  int64_t count_{0};
-
   static DeviceInfo& Global() {
     static auto* x = new DeviceInfo;
     return *x;
   }

-  static void Init() {
-    auto& info = Global();
-    InitInternal(&info);
+  static int Init() {
+    static int ret = Global().Setup();
+    return ret;
   }

-  void SetRunMode(PowerMode mode, int threads);
+  int Setup();
+
+  void SetRunMode(PowerMode mode, int thread_num);
   void SetCache(int l1size, int l2size, int l3size);
   void SetArch(ARMArch arch) { arch_ = arch; }
-  void BindDev();

   PowerMode mode() const { return mode_; }
   int threads() const { return active_ids_.size(); }
   ARMArch arch() const { return arch_; }
-  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
-  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
-  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }

   template <typename T>
   T* workspace_data() {
     return workspace_.mutable_data<T>();
   }
+
+  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
+  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
+  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
+
   bool ExtendWorkspace(DDimLite dims);

  private:
-  DeviceInfo() = default;
-  static void InitInternal(DeviceInfo* dev);
-};
-
-size_t arm_get_meminfo();
-
-int arm_get_cpucount();
-
-void arm_get_cpu_arch(std::vector<ARMArch>* archs);
-
-bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
-
-#ifdef LITE_WITH_LINUX
-
-void set_default_cache(DeviceInfo* dev);
-
-std::string arm_get_cpu_name();
-
-int get_max_freq_khz(int cpuid);
-
-int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
-                                    const std::vector<int>& cpu_freq,
-                                    std::vector<int>* cluster_ids);
-int check_online(const std::vector<int>& core_ids);
-int set_sched_affinity(const std::vector<int>& cpuids);
-
-#endif  // LITE_WITH_LINUX
+  int core_num_;
+  std::vector<int> max_freqs_;
+  std::vector<int> min_freqs_;
+  int mem_size_;
+  std::string dev_name_;
+
+  std::vector<int> L1_cache_;
+  std::vector<int> L2_cache_;
+  std::vector<int> L3_cache_;
+  std::vector<int> core_ids_;
+  std::vector<int> big_core_ids_;
+  std::vector<int> little_core_ids_;
+  std::vector<int> cluster_ids_;
+  std::vector<ARMArch> archs_;
+  ARMArch arch_;
+  // LITE_POWER_HIGH stands for using big cores,
+  // LITE_POWER_LOW stands for using small core,
+  // LITE_POWER_FULL stands for using all cores
+  PowerMode mode_;
+  std::vector<int> active_ids_;
+  TensorLite workspace_;
+  int64_t count_{0};
+
+  void SetCacheInfo(int cache_id, int argc, ...);
+  void SetArchInfo(int argc, ...);
+  bool SetCPUInfoByName();
+  void SetCPUInfoByProb();
+  void RequestPowerFullMode(const int thread_num);
+  void RequestPowerHighMode(const int thread_num);
+  void RequestPowerLowMode(const int thread_num);
+  void RequestPowerNoBindMode(const int thread_num);
+  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
+  void RequestPowerRandLowMode(const int shift_num, const int thread_num);
+
+  DeviceInfo() = default;
+};

 #endif // LITE_WITH_ARM
......
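The power-mode comment in the header above summarizes the binding policy. A second illustrative sketch (not part of the commit) of how a caller might switch modes; the function PickPowerMode() is hypothetical, and the fallback behaviour described in the comments is taken from SetRunMode and the RequestPower*Mode helpers in this diff.

#include "paddle/fluid/lite/core/cpu_info.h"

void PickPowerMode(bool latency_sensitive) {
  auto& dev = paddle::lite::DeviceInfo::Global();
  if (latency_sensitive) {
    // LITE_POWER_HIGH binds worker threads to the big cluster; if the
    // requested thread count exceeds the big-cluster size it is truncated
    // and an error is logged.
    dev.SetRunMode(paddle::lite::LITE_POWER_HIGH, 4);
  } else {
    // LITE_POWER_NO_BIND skips CPU affinity entirely; it is also the mode
    // SetRunMode falls back to when a requested core turns out to be offline.
    dev.SetRunMode(paddle::lite::LITE_POWER_NO_BIND, 2);
  }
}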