提交 ce46ef22 编写于 作者: H hong19860320

ARM cpu_info refine

test=develop
上级 251255bc
......@@ -67,7 +67,7 @@ class Context<TargetType::kARM> {
ARMContext& operator=(const ARMContext& ctx) {}
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() {}
void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {}
......@@ -78,20 +78,19 @@ class Context<TargetType::kARM> {
return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
void BindDev() { return DeviceInfo::Global().BindDev(); }
PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
template <typename T>
T* workspace_data() {
return DeviceInfo::Global().workspace_data<T>();
}
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
bool ExtendWorkspace(DDimLite dims) {
return DeviceInfo::Global().ExtendWorkspace(dims);
}
......
......@@ -29,7 +29,8 @@
#include <omp.h>
#endif
#include <cstdarg>
#include <algorithm>
#include <limits>
#include "paddle/fluid/lite/core/cpu_info.h"
namespace paddle {
......@@ -37,549 +38,55 @@ namespace lite {
#ifdef LITE_WITH_ARM
void DeviceInfo::InitInternal(DeviceInfo* dev) {
set_default_cache(dev);
dev->compute_core_num_ = arm_get_cpucount();
dev->max_memory_ = arm_get_meminfo();
// get max freq
#ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_);
for (int i = 0; i < dev->compute_core_num_; ++i) {
max_freq[i] = get_max_freq_khz(i) / 1000;
}
std::string cpu_name = arm_get_cpu_name();
if (get_cpu_info_from_name(dev, cpu_name) != true) {
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
max_freq, &dev->cluster_ids_);
dev->big_core_ids_.clear();
dev->little_core_ids_.clear();
for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
if (dev->cluster_ids_[i] == 0) {
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
}
}
arm_get_cpu_arch(&dev->archs_);
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_;
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i]
<< ", frequence: " << max_freq[i]
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
if (dev->max_freq_ < max_freq[j]) {
dev->max_freq_ = max_freq[j];
}
}
#elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_);
#endif
dev->active_ids_ = {0};
dev->mode_ = LITE_POWER_HIGH;
dev->workspace_.Resize({static_cast<int64_t>(
dev->L2_cache_[dev->active_ids_[0]] / sizeof(float))});
#ifdef TARGET_IOS
dev->arch_ = APPLE; // use 6x8
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#else
if (dev->big_core_ids_.size() > 0) {
dev->arch_ = dev->archs_[dev->big_core_ids_[0]];
}
const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#endif
}
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
  // Apply one uniform cache configuration to every core and re-size the
  // shared scratch workspace to match (2 * (L1 + L2) bytes).
  const int core_count = arm_get_cpucount();
  L1_cache_.assign(core_count, l1size);
  L2_cache_.assign(core_count, l2size);
  L3_cache_.assign(core_count, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
void DeviceInfo::BindDev() {
#ifdef ARM_WITH_OMP
int num_threads = active_ids_.size();
omp_set_num_threads(num_threads);
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
std::vector<int> ssarets;
for (int j = 0; j < num_threads; ++j) {
ssarets.push_back(0);
}
#pragma omp parallel for
for (int i = 0; i < num_threads; i++) {
ssarets[i] = set_sched_affinity(active_ids_);
}
for (int i = 0; i < num_threads; i++) {
if (ssarets[i] != 0) {
LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
return;
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
cpu_num++;
fclose(fp);
}
#endif // LITE_WITH_LINUX
#else // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
std::vector<int> cpuid1;
cpuid1.push_back(active_ids_[0]);
int ssaret = set_sched_affinity(cpuid1);
if (ssaret != 0) {
printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]);
return;
}
#endif // LITE_WITH_LINUX
#endif // ARM_WITH_OMP
}
void DeviceInfo::SetRunMode(PowerMode mode, int threads) {
int big_core_size = big_core_ids_.size();
int small_core_size = little_core_ids_.size();
if (threads > big_core_size + small_core_size) {
threads = big_core_size + small_core_size;
if (cpu_num < 1) {
cpu_num = 1;
}
#ifdef ARM_WITH_OMP
count_++;
int shift_num = (count_ / 10) % big_core_size;
switch (mode) {
case LITE_POWER_FULL:
mode_ = mode;
active_ids_.clear();
for (int i = 0; i < threads; ++i) {
if (i < big_core_size) {
active_ids_.push_back(big_core_ids_[i]);
} else {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_HIGH:
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (threads > big_core_size) {
LOG(ERROR) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (threads > small_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_LOW:
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (threads > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_NO_BIND:
mode_ = LITE_POWER_NO_BIND;
active_ids_.clear();
if (threads > core_ids_.size()) {
active_ids_.resize(core_ids_.size());
} else {
active_ids_.resize(threads);
}
break;
case LITE_POWER_RAND_HIGH:
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_RAND_HIGH;
if (threads > big_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the big cores size: " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
big_core_ids_[(i + shift_num) % big_core_size]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(WARNING)
<< "HIGH POWER MODE is not support, switch to little cores.";
if (threads > small_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
case LITE_POWER_RAND_LOW:
active_ids_.clear();
if (small_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (threads > small_core_size) {
LOG(WARNING) << "threads: " << threads
<< ", exceed the little cores size: " << small_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % small_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (threads > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < threads; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
if (active_ids_.size() == 0) {
active_ids_.push_back(0);
}
break;
}
//! fix multi-threads LITE_POWER_HIGH mode
if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
int threads = active_ids_.size();
omp_set_num_threads(threads);
} else {
if (check_online(active_ids_)) {
BindDev();
} else {
LOG(WARNING) << "core id " << active_ids_[0]
<< " is offline, switch to NO BIND MODE";
int threads = active_ids_.size();
omp_set_num_threads(threads);
}
return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
}
return cpu_num;
#else
if (big_core_size > 0) {
active_ids_ = {big_core_ids_[0]};
} else {
active_ids_ = {0};
}
return 1;
#endif
//! alloc memory for sgemm in this context
int temp_mem_size = L2_cache_[active_ids_[0]] / sizeof(float);
workspace_.Resize({temp_mem_size});
arch_ = archs_[active_ids_[0]];
}
// Grow the shared scratch buffer so it can hold `dims.product()` floats plus
// one L2-cache worth of padding for the currently active core.
// Returns true when the workspace was re-sized, false when the requested
// element count already equals the current workspace size.
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
  auto count = dims.product();
  auto old = workspace_.dims();
  // NOTE(review): only *exact* equality skips the resize — a smaller request
  // still reallocates. Presumably `count <= old.product()` was intended;
  // TODO confirm before changing.
  if (count == old.product()) {
    return false;
  }
  workspace_.Resize({static_cast<int64_t>(
      count + L2_cache_[active_ids_[0]] / sizeof(float))});
  return true;
}
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
// Fill the selected per-core cache-size table. With argc == 1 every core gets
// the same size; with argc == 2 the first size goes to the big cores and the
// second to the little cores.
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
  // Select the target table first. The original left `cache` uninitialized
  // when cache_id was out of range (switch default fell through to a
  // dereference) — bail out instead.
  std::vector<int>* cache = nullptr;
  switch (cache_id) {
    case 0:
      cache = &cpu_info->L1_cache_;
      break;
    case 1:
      cache = &cpu_info->L2_cache_;
      break;
    case 2:
      cache = &cpu_info->L3_cache_;
      break;
    default:
      LOG(ERROR) << "unsupported cache_id: " << cache_id;
      return;
  }
  va_list arg_ptr;
  va_start(arg_ptr, argc);
  int core_num = cpu_info->compute_core_num_;
  cache->resize(core_num);
  if (argc == 1) {
    // Single size shared by all cores.
    int cache_size = va_arg(arg_ptr, int);
    for (int i = 0; i < core_num; ++i) {
      (*cache)[i] = cache_size;
    }
  } else {
    // Two sizes: big-cluster cores first, little-cluster cores second.
    int big_core_num = cpu_info->big_core_ids_.size();
    int little_core_num = cpu_info->little_core_ids_.size();
    int big_core_cache_size = va_arg(arg_ptr, int);
    int little_core_cache_size = va_arg(arg_ptr, int);
    for (int i = 0; i < big_core_num; ++i) {
      (*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size;
    }
    for (int i = 0; i < little_core_num; ++i) {
      (*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size;
    }
  }
  va_end(arg_ptr);
}
// Record the micro-architecture of every core. With one variadic argument all
// cores share a single arch; with two, the first applies to the big cluster
// and the second to the little cluster.
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
  va_list args;
  va_start(args, argc);
  const int num_cores = cpu_info->compute_core_num_;
  cpu_info->archs_.resize(num_cores);
  if (argc == 1) {
    const ARMArch uniform_arch = (ARMArch)va_arg(args, int);
    for (int core = 0; core < num_cores; ++core) {
      cpu_info->archs_[core] = uniform_arch;
    }
  } else {
    const ARMArch big_arch = (ARMArch)va_arg(args, int);
    const ARMArch little_arch = (ARMArch)va_arg(args, int);
    const int num_big = cpu_info->big_core_ids_.size();
    const int num_little = cpu_info->little_core_ids_.size();
    for (int i = 0; i < num_big; ++i) {
      cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_arch;
    }
    for (int i = 0; i < num_little; ++i) {
      cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_arch;
    }
  }
  va_end(args);
}
// Fill in the CPU topology (core count, core ids, big/little clusters, arch,
// cache sizes) from a table of known SoCs, keyed on the hardware name read
// from /proc/cpuinfo. Returns true on a match; false tells the caller to fall
// back to frequency-based probing.
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
  /* Snapdragon */
  if (hardware_name.find("SDM845") != std::string::npos) {  // 845
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {4, 5, 6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    set_arch_info(cpu_info, 2, kA75, kA55);
    set_cache_info(cpu_info, 0, 1, 32 * 1024);
    set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024);
    set_cache_info(cpu_info, 2, 1, 2048 * 1024);
    return true;
  } else if (hardware_name.find("SDM710") != std::string::npos) {  // 710
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
    set_arch_info(cpu_info, 2, kA75, kA55);
    return true;
  } else if (hardware_name.find("MSM8998") != std::string::npos) {  // 835
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {4, 5, 6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    set_arch_info(cpu_info, 2, kA73, kA53);
    set_cache_info(cpu_info, 0, 2, 64 * 1024);
    set_cache_info(cpu_info, 1, 2, 1024 * 1024,
                   /*real cache size is 2M, while that will get bad performace
                      on conv3x3s1 or gemm, set to 1M or 512K*/
                   1024 * 1024);
    return true;
  } else if (hardware_name.find("MSM8996") != std::string::npos) {  // 820
    cpu_info->compute_core_num_ = 4;
    cpu_info->core_ids_ = {0, 1, 2, 3};
    cpu_info->big_core_ids_ = {2, 3};
    cpu_info->little_core_ids_ = {0, 1};
    cpu_info->cluster_ids_ = {1, 1, 0, 0};
    set_arch_info(cpu_info, 1, kA72);
    set_cache_info(cpu_info, 0, 1, 24 * 1024);
    set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (hardware_name.find("SDM660") != std::string::npos ||
             hardware_name.find("SDM636") != std::string::npos) {  // 660, 636
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {4, 5, 6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    set_arch_info(cpu_info, 1, kA73);
    set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024);
    set_cache_info(cpu_info, 1, 1, 1024 * 1024);
    return true;
  } else if (hardware_name.find("MSM8976") != std::string::npos) {  // 652,653
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {4, 5, 6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    set_arch_info(cpu_info, 2, kA72, kA53);
    set_cache_info(cpu_info, 0, 1, 32 * 1024);
    set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (hardware_name.find("MSM8953") != std::string::npos) {  // 625
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->little_core_ids_ = {};
    cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    set_arch_info(cpu_info, 1, kA53);
    set_cache_info(cpu_info, 0, 1, 32 * 1024);
    set_cache_info(cpu_info, 1, 1, 1024 * 1024);
    return true;
  } else if (hardware_name.find("MSM8939") != std::string::npos) {  // 615
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {0, 1, 2, 3};
    cpu_info->little_core_ids_ = {4, 5, 6, 7};
    cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
    set_arch_info(cpu_info, 1, kA53);
    set_cache_info(cpu_info, 0, 1, 32 * 1024);
    set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
    return true;
    /* MediaTek */
  } else if (hardware_name.find("MT6797") !=
             std::string::npos) {  // X20/X23/X25/X27
    cpu_info->compute_core_num_ = 10;
    // BUGFIX: table previously listed 11 core ids for a 10-core SoC.
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    cpu_info->big_core_ids_ = {8, 9};
    cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    set_arch_info(cpu_info, 2, kA72, kA53);
    set_cache_info(cpu_info, 0, 1, 32 * 1024);
    set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (hardware_name.find("MT6799") != std::string::npos) {  // X30
    cpu_info->compute_core_num_ = 10;
    // BUGFIX: table previously listed 11 core ids for a 10-core SoC.
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    cpu_info->big_core_ids_ = {8, 9};
    cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    set_arch_info(cpu_info, 2, kA73, kA53);
    return true;
  } else if (hardware_name.find("MT6795") != std::string::npos ||
             hardware_name.find("MT6762") != std::string::npos ||
             hardware_name.find("MT6755T") != std::string::npos ||
             hardware_name.find("MT6755S") != std::string::npos ||
             hardware_name.find("MT6753") != std::string::npos ||
             hardware_name.find("MT6752") != std::string::npos ||
             hardware_name.find("MT6750") != std::string::npos) {
    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->little_core_ids_ = {};
    cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    set_arch_info(cpu_info, 1, kA53);
    return true;
  } else if (hardware_name.find("MT6758") != std::string::npos ||
             hardware_name.find("MT6757") != std::string::npos ||
             hardware_name.find("MT6763") != std::string::npos ||
             hardware_name.find("MT6755M") != std::string::npos ||
             hardware_name.find("MT6755") !=
                 std::string::npos) {  // P30, P20/P25, P23, P10
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {4, 5, 6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    set_arch_info(cpu_info, 1, kA53);
    return true;
  } else if (hardware_name.find("MT6771") != std::string::npos) {  // P60
    cpu_info->compute_core_num_ = 8;
    cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cpu_info->big_core_ids_ = {4, 5, 6, 7};
    cpu_info->little_core_ids_ = {0, 1, 2, 3};
    cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    set_arch_info(cpu_info, 2, kA73, kA53);
    return true;
  } else if (hardware_name.find("MT6765") != std::string::npos ||
             hardware_name.find("MT6739") != std::string::npos ||
             hardware_name.find("MT6738") != std::string::npos ||
             hardware_name.find("MT6737") !=
                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
    cpu_info->compute_core_num_ = 4;
    cpu_info->core_ids_ = {0, 1, 2, 3};
    // BUGFIX: big_core_ids_ previously held {0, 0, 0, 0} (duplicate ids).
    cpu_info->big_core_ids_ = {0, 1, 2, 3};
    cpu_info->little_core_ids_ = {};
    cpu_info->cluster_ids_ = {0, 0, 0, 0};
    set_arch_info(cpu_info, 1, kA53);
    return true;
  }
  return false;
}
size_t arm_get_meminfo() {
size_t get_mem_size() {
#ifdef LITE_WITH_LINUX
// get cpu count from /proc/cpuinfo
FILE* fp = fopen("/proc/meminfo", "rb");
if (!fp) {
return 1;
}
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
......@@ -589,52 +96,18 @@ size_t arm_get_meminfo() {
}
sscanf(s, "MemTotal: %d kB", &memsize);
}
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
}
// Return the number of CPU cores visible to this process, with a floor of 1.
// Linux: counts cpu%d entries under sysfs; iOS: queries sysctl hw.ncpu;
// other platforms: assumes a single core.
int arm_get_cpucount() {
#ifdef LITE_WITH_LINUX
  // get cpu count from /sys/devices/system/cpu/cpunum/uevent
  int max_cpu_count = 20;  // probe at most 20 cores
  int count = 0;
  for (int i = 0; i < max_cpu_count; ++i) {
    char path[256];
    snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
    FILE* fp = fopen(path, "rb");
    if (!fp) {
      // First missing cpu%d entry marks the end of the core list.
      break;
    }
    count++;
    fclose(fp);
  }
  if (count < 1) {
    count = 1;
  }
  return count;
#elif defined(TARGET_IOS)
  int count = 0;
  size_t len = sizeof(count);
  sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
  if (count < 1) {
    count = 1;
  }
  return count;
#else
  return 1;
#endif
  // NOTE(review): unreachable — every preprocessor branch above returns.
  return 0;
}
void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
#ifdef LITE_WITH_LINUX
void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
archs->clear();
#ifdef LITE_WITH_LINUX
//! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
......@@ -668,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
......@@ -677,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
int cpu_count = arm_get_cpucount();
if (archs->size() < cpu_count) {
for (int i = archs->size(); i < cpu_count; ++i) {
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
}
}
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(APPLE);
}
#else
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(kARMArch_UNKOWN);
}
#endif
}
#ifdef LITE_WITH_LINUX
// Seed every core's cache-size tables with platform defaults; the real sizes
// are refined later from the per-SoC table or sysfs probing.
void set_default_cache(DeviceInfo* dev) {
  const int core_count = arm_get_cpucount();
#ifdef TARGET_IOS
  const int default_l1 = 64 * 1024;
  const int default_l2 = 2048 * 1024;
#else
  const int default_l1 = 32 * 1024;
  const int default_l2 = 512 * 1024;
#endif
  dev->L1_cache_.assign(core_count, default_l1);
  dev->L2_cache_.assign(core_count, default_l2);
  dev->L3_cache_.assign(core_count, 0);
}
std::string arm_get_cpu_name() {
std::string get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
......@@ -739,122 +217,163 @@ std::string arm_get_cpu_name() {
return "";
}
int get_max_freq_khz(int cpuid) {
void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
*max_freq = 0;
*min_freq = 0;
// first try, for all possible cpu
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return -1;
return;
}
int max_freq_khz = -1;
fscanf(fp, "%d", &max_freq_khz);
fscanf(fp, "%d", max_freq);
fclose(fp);
return max_freq_khz;
// get min_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
}
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
}
}
int max_freq_khz = 0;
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break;
}
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
if (freq > *max_freq) {
*max_freq = freq;
}
if (freq < *min_freq) {
*min_freq = freq;
}
}
fclose(fp);
return max_freq_khz;
}
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids) {
if (cpu_count == 0) {
return 0;
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
}
cpuids->resize(cpu_count);
cluster_ids->resize(cpu_count);
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
cpu_ids->resize(cpu_num);
cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
}
// sort cpuid as big core first
// simple bubble sort
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
for (int i = 0; i < cpu_num; i++) {
for (int j = i + 1; j < cpu_num; j++) {
if (max_freqs[i] < max_freqs[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
int tmp = cpu_ids->at(i);
cpu_ids->at(i) = cpu_ids->at(j);
cpu_ids->at(j) = tmp;
}
}
}
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else {
cluster_ids->at(i) = 1;
}
}
return 0;
}
int check_online(const std::vector<int>& core_ids) {
if (core_ids.size() == 0) {
return 0;
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
int* l3_cache_size) {
int max_cache_idx_num = 10;
*l1_cache_size = DEFAULT_L1_CACHE_SIZE;
*l2_cache_size = DEFAULT_L2_CACHE_SIZE;
*l3_cache_size = DEFAULT_L3_CACHE_SIZE;
for (int i = 0; i < max_cache_idx_num; i++) {
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i);
FILE* fp = fopen(path, "rb");
if (fp) {
int level = -1;
fscanf(fp, "%d", &level);
fclose(fp);
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i);
fp = fopen(path, "rb");
if (fp) {
int size = -1;
fscanf(fp, "%d", &size);
fclose(fp);
if (size >= 0) {
if (level == 1) {
*l1_cache_size = size * 1024;
} else if (level == 2) {
*l2_cache_size = size * 1024;
} else if (level == 3) {
*l3_cache_size = size * 1024;
}
}
}
}
}
}
bool check_cpu_online(const std::vector<int>& cpu_ids) {
if (cpu_ids.size() == 0) {
return false;
}
char path[256];
int online = 1;
for (int i = 0; i < core_ids.size(); ++i) {
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
core_ids[i]);
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
if (!fp) {
return 0;
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
}
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpuids) {
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
......@@ -870,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpuids.size(); i++) {
CPU_SET(cpuids[i], &mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
}
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
}
return 0;
}
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
int thread_num = cpu_ids.size();
omp_set_num_threads(thread_num);
std::vector<int> ssarets;
for (int i = 0; i < thread_num; ++i) {
ssarets.push_back(0);
}
#pragma omp parallel for
for (int i = 0; i < thread_num; i++) {
ssarets[i] = set_sched_affinity(cpu_ids);
}
for (int i = 0; i < thread_num; i++) {
if (ssarets[i] != 0) {
LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
return false;
}
}
#else // ARM_WITH_OMP
std::vector<int> first_cpu_id;
first_cpu_id.push_back(cpu_ids[0]);
int ssaret = set_sched_affinity(first_cpu_id);
if (ssaret != 0) {
LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
return false;
}
#endif // ARM_WITH_OMP
}
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &L1_cache_;
break;
case 1:
cache = &L2_cache_;
break;
case 2:
cache = &L3_cache_;
break;
default:
break;
}
cache->resize(core_num_);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[little_core_ids_[i]] = little_core_cache_size;
}
}
va_end(arg_ptr);
}
void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
archs_.resize(core_num_);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
archs_[big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
archs_[little_core_ids_[i]] = little_core_arch;
}
}
va_end(arg_ptr);
}
// Fill in the CPU topology (core count, core ids, big/little clusters, arch,
// and — where known — cache sizes) from a table of known SoCs, keyed on the
// device name in dev_name_. Returns true on a match; false tells the caller
// to fall back to frequency-based probing (SetCPUInfoByProb).
bool DeviceInfo::SetCPUInfoByName() {
  /* Snapdragon */
  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA76, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {6, 7};
    little_core_ids_ = {0, 1, 2, 3, 4, 5};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8998") != std::string::npos) {  // 835
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024,
                 /*real cache size is 2M, while that will get bad performace
                    on conv3x3s1 or gemm, set to 1M or 512K*/
                 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8996") != std::string::npos) {  // 820
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {2, 3};
    little_core_ids_ = {0, 1};
    cluster_ids_ = {1, 1, 0, 0};
    SetArchInfo(1, kA72);
    SetCacheInfo(0, 1, 24 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("SDM660") != std::string::npos ||
             dev_name_.find("SDM636") != std::string::npos) {  // 660, 636
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA73);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8976") != std::string::npos) {  // 652,653
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MSM8953") != std::string::npos) {  // 625
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8939") != std::string::npos) {  // 615
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {4, 5, 6, 7};
    cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
    return true;
    /* MediaTek */
  } else if (dev_name_.find("MT6797") !=
             std::string::npos) {  // X20/X23/X25/X27
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MT6799") != std::string::npos) {  // X30
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6795") != std::string::npos ||
             dev_name_.find("MT6762") != std::string::npos ||
             dev_name_.find("MT6755T") != std::string::npos ||
             dev_name_.find("MT6755S") != std::string::npos ||
             dev_name_.find("MT6753") != std::string::npos ||
             dev_name_.find("MT6752") != std::string::npos ||
             dev_name_.find("MT6750") != std::string::npos) {
    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6758") != std::string::npos ||
             dev_name_.find("MT6757") != std::string::npos ||
             dev_name_.find("MT6763") != std::string::npos ||
             dev_name_.find("MT6755M") != std::string::npos ||
             dev_name_.find("MT6755") !=
                 std::string::npos) {  // P30, P20/P25, P23, P10
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6771") != std::string::npos) {  // P60
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6765") != std::string::npos ||
             dev_name_.find("MT6739") != std::string::npos ||
             dev_name_.find("MT6738") != std::string::npos ||
             dev_name_.find("MT6737") !=
                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  }
  return false;
}
// Fallback topology probe used when the SoC is not in the known-device table
// (SetCPUInfoByName() returned false): infers the big.LITTLE layout by
// sorting cores on their maximum frequency, then reads per-core cache sizes.
void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX
  // get big.LITTLE cores by sorting CPU frequency
  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
  big_core_ids_.clear();
  little_core_ids_.clear();
  // Cluster 0 holds the high-frequency (big) cores, cluster 1 the little
  // ones -- same convention as the hard-coded tables in SetCPUInfoByName().
  // Use size_t to match vector::size() and avoid a signed/unsigned compare.
  for (size_t i = 0; i < cluster_ids_.size(); ++i) {
    if (cluster_ids_[i] == 0) {
      big_core_ids_.push_back(core_ids_[i]);
    } else {
      little_core_ids_.push_back(core_ids_[i]);
    }
  }
  // get l1, l2, l3 cache size for each core
  for (int i = 0; i < core_num_; ++i) {
    get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i]));
  }
#endif  // LITE_WITH_LINUX
}
void DeviceInfo::RequestPowerFullMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
for (int i = 0; i < thread_num; ++i) {
if (i < big_core_size) {
active_ids_.push_back(big_core_ids_[i]);
} else if (i < big_core_size + little_core_size) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
}
void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// NO_BIND mode: do not pin threads to cores. active_ids_ only carries the
// requested thread count (core id 0 repeated); it is read later to size the
// OpenMP pool and the workspace.
void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
  active_ids_ = std::vector<int>(std::max(thread_num, 0), 0);
  mode_ = LITE_POWER_NO_BIND;
}
// RAND_HIGH mode: like HIGH mode, but the starting big core is rotated by
// shift_num so repeated runs spread work (and heat) across the big cluster.
// Falls back to little cores when no big cores exist.
void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
                                          const int thread_num) {
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  // BUGFIX: every other Request*Mode clears active_ids_ before refilling it;
  // without this, repeated SetRunMode() calls kept appending stale core ids.
  active_ids_.clear();
  if (big_core_size > 0) {
    mode_ = LITE_POWER_RAND_HIGH;
    if (thread_num > big_core_size) {
      LOG(WARNING) << "Request thread num: " << thread_num
                   << ", exceed the big cores size: " << big_core_size
                   << ", truncate thread num to " << big_core_size;
      active_ids_ = big_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
      }
    }
  } else {
    mode_ = LITE_POWER_LOW;
    LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
    if (thread_num > little_core_size) {
      active_ids_ = little_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(little_core_ids_[i]);
      }
    }
  }
}
void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % little_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// Probes the host CPU -- core count, per-core arch, memory size, per-core
// max/min frequency, big.LITTLE layout, cache sizes -- logs the result, and
// installs the single-threaded NO_BIND default run mode. Returns 0.
int DeviceInfo::Setup() {
  core_num_ = get_cpu_num();
  mem_size_ = get_mem_size();
  get_cpu_arch(&archs_, core_num_);
  // Set default CPU cache info. SetCacheInfo's signature is
  // (cache_id, argc, sizes...) -- see the three-argument calls in
  // SetCPUInfoByName(). BUGFIX: the previous two-argument calls passed the
  // cache size in the argc slot, so the vararg reader would try to consume
  // `size` nonexistent arguments.
  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
#ifdef LITE_WITH_LINUX
  // get max&min freq (reported in kHz; stored as MHz)
  max_freqs_.resize(core_num_);
  min_freqs_.resize(core_num_);
  for (int i = 0; i < core_num_; ++i) {
    int max_freq, min_freq;
    get_cpu_max_min_freq(i, &max_freq, &min_freq);
    max_freqs_[i] = max_freq / 1000;
    min_freqs_[i] = min_freq / 1000;
  }
  // get cache size and big.LITTLE core ids: try the known-SoC table first,
  // then fall back to frequency-based probing.
  dev_name_ = get_cpu_name();
  if (!SetCPUInfoByName()) {
    SetCPUInfoByProb();
  }
  // output info
  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
              << ", max freq: " << max_freqs_[i]
              << ", min freq: " << min_freqs_[i]
              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
              << ", CPU ARCH: A" << archs_[i];
  }
  LOG(INFO) << "L1 DataCache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "L2 Cache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
  // set default run mode
  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
  return 0;
}
// Applies the requested power mode: clamps thread_num to what the device
// supports, selects the active core set, configures OpenMP and (on Linux)
// binds threads, then sizes the shared workspace for the first active core.
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
#else
  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
#endif
#ifdef LITE_WITH_LINUX
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  int big_little_core_size = big_core_size + little_core_size;
  thread_num = std::min(thread_num, big_little_core_size);
  count_++;
  // Rotate RAND_* start cores every 10 calls.
  // BUGFIX: `(count_ / 10) % big_core_size` divided by zero when no big
  // cores were detected; use shift 0 in that case.
  int shift_num = big_core_size > 0 ? (count_ / 10) % big_core_size : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      RequestPowerFullMode(thread_num);
      break;
    case LITE_POWER_HIGH:
      RequestPowerHighMode(thread_num);
      break;
    case LITE_POWER_LOW:
      RequestPowerLowMode(thread_num);
      break;
    case LITE_POWER_NO_BIND:
      RequestPowerNoBindMode(thread_num);
      break;
    case LITE_POWER_RAND_HIGH:
      RequestPowerRandHighMode(shift_num, thread_num);
      break;
    case LITE_POWER_RAND_LOW:
      RequestPowerRandLowMode(shift_num, thread_num);
      break;
    default:
      LOG(FATAL) << "Unsupported power mode: " << mode;
      break;
  }
  // Keep at least one active core so the indexing below stays valid.
  if (active_ids_.size() == 0) {
    active_ids_.push_back(0);
  }
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
  if (mode_ != LITE_POWER_NO_BIND) {
    if (check_cpu_online(active_ids_)) {
      bind_threads(active_ids_);
    } else {
      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
      mode_ = LITE_POWER_NO_BIND;
    }
  }
#else   // LITE_WITH_LINUX
  // only LITE_POWER_NO_BIND is supported in other OS
  RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
#endif  // LITE_WITH_LINUX
  //! alloc memory for sgemm in this context, sized to the first active
  //! core's L2 cache measured in floats
  workspace_.Resize(
      {static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
  arch_ = archs_[active_ids_[0]];
}
// Overrides the detected L1/L2/L3 cache sizes (bytes) and resizes the
// shared workspace accordingly.
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
  // SetCacheInfo's signature is (cache_id, argc, sizes...). BUGFIX: the
  // previous two-argument calls passed the size in the argc slot, making the
  // vararg reader consume `size` nonexistent arguments.
  SetCacheInfo(0, 1, l1size);
  SetCacheInfo(1, 1, l2size);
  SetCacheInfo(2, 1, l3size);
  // NOTE(review): sized as 2*(l1+l2) raw elements here, while SetRunMode()
  // sizes the workspace as L2-bytes / sizeof(float) -- confirm the units.
  workspace_.Resize({2 * (l1size + l2size)});
}
// Resizes the shared workspace to hold dims.product() elements plus an
// L2-cache-sized margin (in floats) for the first active core.
// Returns false (workspace untouched) when the requested element count
// already equals the current size, true after a resize.
// NOTE(review): when `count` is *smaller* than the current size this still
// resizes -- possibly shrinking -- despite the name "Extend"; confirm that
// is intended.
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
  auto count = dims.product();
  auto old = workspace_.dims();
  if (count == old.product()) {
    return false;
  }
  // Extra tail sized to the active core's L2 cache, usable as scratch space.
  workspace_.Resize({static_cast<int64_t>(
      count + L2_cache_[active_ids_[0]] / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
......
......@@ -14,6 +14,7 @@
#pragma once
#include <cstdarg>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
......@@ -47,92 +48,73 @@ typedef enum {
// NOTE(review): this span is a *flattened diff* of the header -- old and new
// declarations appear interleaved: two Init() variants (one missing its
// closing brace), two SetRunMode declarations, duplicated l1/l2/l3 accessor
// and member lists, two DeviceInfo() constructors, and free-function
// declarations (arm_get_meminfo, get_max_freq_khz, ...) mixed into the class
// body. It does not compile as shown; recover the post-commit header before
// editing anything below.
class DeviceInfo {
 public:
  int idx_;
  int max_freq_;
  int min_freq_;
  int generate_arch_;
  int compute_core_num_;
  int max_memory_;
  int sharemem_size_;
  std::string device_name_;
  std::string compute_ability_;
  std::vector<int> L1_cache_;
  std::vector<int> L2_cache_;
  std::vector<int> L3_cache_;
  std::vector<int> core_ids_;
  std::vector<int> big_core_ids_;
  std::vector<int> little_core_ids_;
  std::vector<int> cluster_ids_;
  std::vector<ARMArch> archs_;
  ARMArch arch_;
  // LITE_POWER_HIGH stands for using big cores,
  // LITE_POWER_LOW stands for using small core,
  // LITE_POWER_FULL stands for using all cores
  PowerMode mode_;
  std::vector<int> active_ids_;
  TensorLite workspace_;
  int64_t count_{0};
  static DeviceInfo& Global() {
    static auto* x = new DeviceInfo;
    return *x;
  }
  // NOTE(review): pre-commit Init() (body truncated by the diff view):
  static void Init() {
    auto& info = Global();
    InitInternal(&info);
  // NOTE(review): post-commit Init(): runs Setup() exactly once.
  static int Init() {
    static int ret = Global().Setup();
    return ret;
  }
  void SetRunMode(PowerMode mode, int threads);
  int Setup();
  void SetRunMode(PowerMode mode, int thread_num);
  void SetCache(int l1size, int l2size, int l3size);
  void SetArch(ARMArch arch) { arch_ = arch; }
  void BindDev();
  PowerMode mode() const { return mode_; }
  int threads() const { return active_ids_.size(); }
  ARMArch arch() const { return arch_; }
  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
  template <typename T>
  T* workspace_data() {
    return workspace_.mutable_data<T>();
  }
  int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
  int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
  int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
  bool ExtendWorkspace(DDimLite dims);
 private:
  DeviceInfo() = default;
  static void InitInternal(DeviceInfo* dev);
};
// NOTE(review): removed pre-commit free-function declarations follow.
size_t arm_get_meminfo();
int arm_get_cpucount();
void arm_get_cpu_arch(std::vector<ARMArch>* archs);
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
  int core_num_;
  std::vector<int> max_freqs_;
  std::vector<int> min_freqs_;
  int mem_size_;
  std::string dev_name_;
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev);
std::string arm_get_cpu_name();
  std::vector<int> L1_cache_;
  std::vector<int> L2_cache_;
  std::vector<int> L3_cache_;
  std::vector<int> core_ids_;
  std::vector<int> big_core_ids_;
  std::vector<int> little_core_ids_;
  std::vector<int> cluster_ids_;
  std::vector<ARMArch> archs_;
int get_max_freq_khz(int cpuid);
  ARMArch arch_;
  // LITE_POWER_HIGH stands for using big cores,
  // LITE_POWER_LOW stands for using small core,
  // LITE_POWER_FULL stands for using all cores
  PowerMode mode_;
  std::vector<int> active_ids_;
  TensorLite workspace_;
  int64_t count_{0};
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
                                    const std::vector<int>& cpu_freq,
                                    std::vector<int>* cluster_ids);
int check_online(const std::vector<int>& core_ids);
int set_sched_affinity(const std::vector<int>& cpuids);
  // SetCacheInfo takes (cache_id, argc, sizes...): argc counts the vararg
  // sizes (1 = uniform, 2 = big/little) -- see calls in SetCPUInfoByName().
  void SetCacheInfo(int cache_id, int argc, ...);
  void SetArchInfo(int argc, ...);
  bool SetCPUInfoByName();
  void SetCPUInfoByProb();
  void RequestPowerFullMode(const int thread_num);
  void RequestPowerHighMode(const int thread_num);
  void RequestPowerLowMode(const int thread_num);
  void RequestPowerNoBindMode(const int thread_num);
  void RequestPowerRandHighMode(const int shift_num, const int thread_num);
  void RequestPowerRandLowMode(const int shift_num, const int thread_num);
#endif  // LITE_WITH_LINUX
  DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册