From ce46ef22d6efc2696bf677c61c33536a21b92a8f Mon Sep 17 00:00:00 2001 From: hong19860320 <9973393+hong19860320@users.noreply.github.com> Date: Thu, 20 Jun 2019 03:37:49 +0000 Subject: [PATCH] ARM cpu_info refine test=develop --- paddle/fluid/lite/core/context.h | 9 +- paddle/fluid/lite/core/cpu_info.cc | 1384 +++++++++++++++------------- paddle/fluid/lite/core/cpu_info.h | 104 +-- 3 files changed, 773 insertions(+), 724 deletions(-) diff --git a/paddle/fluid/lite/core/context.h b/paddle/fluid/lite/core/context.h index 81041dfc9..a79ce04fa 100644 --- a/paddle/fluid/lite/core/context.h +++ b/paddle/fluid/lite/core/context.h @@ -67,7 +67,7 @@ class Context { ARMContext& operator=(const ARMContext& ctx) {} // NOTE: InitOnce should only be used by ContextScheduler - void InitOnce() {} + void InitOnce() { DeviceInfo::Init(); } void CopyShared(const ARMContext* ctx) {} @@ -78,20 +78,19 @@ class Context { return DeviceInfo::Global().SetCache(l1size, l2size, l3size); } void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); } - void BindDev() { return DeviceInfo::Global().BindDev(); } PowerMode mode() const { return DeviceInfo::Global().mode(); } int threads() const { return DeviceInfo::Global().threads(); } ARMArch arch() const { return DeviceInfo::Global().arch(); } + int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } + int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } + int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } template T* workspace_data() { return DeviceInfo::Global().workspace_data(); } - int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); } - int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); } - int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); } bool ExtendWorkspace(DDimLite dims) { return DeviceInfo::Global().ExtendWorkspace(dims); } diff --git a/paddle/fluid/lite/core/cpu_info.cc b/paddle/fluid/lite/core/cpu_info.cc index 52cd47fb5..40353631f 100644 --- a/paddle/fluid/lite/core/cpu_info.cc +++ b/paddle/fluid/lite/core/cpu_info.cc @@ -29,7 +29,8 @@ #include #endif -#include +#include +#include #include "paddle/fluid/lite/core/cpu_info.h" namespace paddle { @@ -37,549 +38,55 @@ namespace lite { #ifdef LITE_WITH_ARM -void DeviceInfo::InitInternal(DeviceInfo* dev) { - set_default_cache(dev); - dev->compute_core_num_ = arm_get_cpucount(); - dev->max_memory_ = arm_get_meminfo(); - -// get max freq -#ifdef LITE_WITH_LINUX - std::vector max_freq(dev->compute_core_num_); - for (int i = 0; i < dev->compute_core_num_; ++i) { - max_freq[i] = get_max_freq_khz(i) / 1000; - } - std::string cpu_name = arm_get_cpu_name(); - if (get_cpu_info_from_name(dev, cpu_name) != true) { - arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_, - max_freq, &dev->cluster_ids_); - dev->big_core_ids_.clear(); - dev->little_core_ids_.clear(); - for (int i = 0; i < dev->cluster_ids_.size(); ++i) { - if (dev->cluster_ids_[i] == 0) { - dev->big_core_ids_.push_back(dev->core_ids_[i]); - } else { - dev->little_core_ids_.push_back(dev->core_ids_[i]); - } - } - arm_get_cpu_arch(&dev->archs_); - } - - LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_; - for (int i = 0; i < dev->compute_core_num_; ++i) { - LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i] - << ", frequence: " << max_freq[i] - << ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]] - << ", CPU ARCH: A" << dev->archs_[i]; - } - VLOG(1) << "L1 DataCache size is: "; - for (int i = 0; i < dev->compute_core_num_; ++i) { - VLOG(1) << dev->L1_cache_[i] / 1024 << " KB"; - } - VLOG(1) << "L2 Cache size is: "; - for (int i = 0; i < dev->compute_core_num_; ++i) { - VLOG(1) << dev->L2_cache_[i] / 1024 << " KB"; - } - VLOG(1) << "Total memory: " << dev->max_memory_ << "KB"; - - dev->max_freq_ = max_freq[0]; - for (int j = 1; j < dev->compute_core_num_; ++j) { - if (dev->max_freq_ < max_freq[j]) { - dev->max_freq_ = max_freq[j]; - } - } -#elif defined(TARGET_IOS) - arm_get_cpu_arch(&dev->archs_); -#endif - dev->active_ids_ = {0}; - dev->mode_ = LITE_POWER_HIGH; - dev->workspace_.Resize({static_cast( - dev->L2_cache_[dev->active_ids_[0]] / sizeof(float))}); #ifdef TARGET_IOS - dev->arch_ = APPLE; // use 6x8 +const int DEFAULT_L1_CACHE_SIZE = 64 * 1024; +const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024; +const int DEFAULT_L3_CACHE_SIZE = 0; #else - if (dev->big_core_ids_.size() > 0) { - dev->arch_ = dev->archs_[dev->big_core_ids_[0]]; - } +const int DEFAULT_L1_CACHE_SIZE = 32 * 1024; +const int DEFAULT_L2_CACHE_SIZE = 512 * 1024; +const int DEFAULT_L3_CACHE_SIZE = 0; #endif -} -void DeviceInfo::SetCache(int l1size, int l2size, int l3size) { - int cpu_count = arm_get_cpucount(); - L1_cache_.resize(cpu_count); - L2_cache_.resize(cpu_count); - L3_cache_.resize(cpu_count); - for (int i = 0; i < cpu_count; ++i) { - L1_cache_[i] = l1size; - L2_cache_[i] = l2size; - L3_cache_[i] = l3size; - } - workspace_.Resize({2 * (l1size + l2size)}); -} - -void DeviceInfo::BindDev() { -#ifdef ARM_WITH_OMP - int num_threads = active_ids_.size(); - omp_set_num_threads(num_threads); +int get_cpu_num() { #ifdef LITE_WITH_LINUX - std::vector ssarets; - for (int j = 0; j < num_threads; ++j) { - ssarets.push_back(0); - } -#pragma omp parallel for - for (int i = 0; i < num_threads; i++) { - ssarets[i] = set_sched_affinity(active_ids_); - } - for (int i = 0; i < num_threads; i++) { - if (ssarets[i] != 0) { - LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i]; - return; + // get cpu count from /sys/devices/system/cpu/cpunum/uevent + int max_cpu_num = 20; + int cpu_num = 0; + for (int i = 0; i < max_cpu_num; ++i) { + char path[256]; + snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); + FILE* fp = fopen(path, "rb"); + if (!fp) { + break; } + cpu_num++; + fclose(fp); } -#endif // LITE_WITH_LINUX -#else // ARM_WITH_OMP -#ifdef LITE_WITH_LINUX - std::vector cpuid1; - cpuid1.push_back(active_ids_[0]); - int ssaret = set_sched_affinity(cpuid1); - if (ssaret != 0) { - printf("set cpu affinity failed, cpuID: %d\n", active_ids_[0]); - return; - } -#endif // LITE_WITH_LINUX -#endif // ARM_WITH_OMP -} - -void DeviceInfo::SetRunMode(PowerMode mode, int threads) { - int big_core_size = big_core_ids_.size(); - int small_core_size = little_core_ids_.size(); - if (threads > big_core_size + small_core_size) { - threads = big_core_size + small_core_size; + if (cpu_num < 1) { + cpu_num = 1; } -#ifdef ARM_WITH_OMP - count_++; - int shift_num = (count_ / 10) % big_core_size; - switch (mode) { - case LITE_POWER_FULL: - mode_ = mode; - active_ids_.clear(); - for (int i = 0; i < threads; ++i) { - if (i < big_core_size) { - active_ids_.push_back(big_core_ids_[i]); - } else { - active_ids_.push_back(little_core_ids_[i - big_core_size]); - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_HIGH: - active_ids_.clear(); - if (big_core_size > 0) { - mode_ = LITE_POWER_HIGH; - if (threads > big_core_size) { - LOG(ERROR) << "threads: " << threads - << ", exceed the big cores size: " << big_core_size; - active_ids_ = big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(big_core_ids_[i]); - } - } - } else { - mode_ = LITE_POWER_LOW; - LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores."; - if (threads > small_core_size) { - active_ids_ = little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(little_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_LOW: - active_ids_.clear(); - if (small_core_size > 0) { - mode_ = LITE_POWER_LOW; - if (threads > small_core_size) { - LOG(WARNING) << "threads: " << threads - << ", exceed the little cores size: " << small_core_size; - active_ids_ = little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(little_core_ids_[i]); - } - } - } else { - mode_ = LITE_POWER_HIGH; - LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; - if (threads > big_core_size) { - active_ids_ = big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(big_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_NO_BIND: - mode_ = LITE_POWER_NO_BIND; - active_ids_.clear(); - if (threads > core_ids_.size()) { - active_ids_.resize(core_ids_.size()); - } else { - active_ids_.resize(threads); - } - break; - case LITE_POWER_RAND_HIGH: - active_ids_.clear(); - if (big_core_size > 0) { - mode_ = LITE_POWER_RAND_HIGH; - if (threads > big_core_size) { - LOG(WARNING) << "threads: " << threads - << ", exceed the big cores size: " << big_core_size; - active_ids_ = big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back( - big_core_ids_[(i + shift_num) % big_core_size]); - } - } - } else { - mode_ = LITE_POWER_LOW; - LOG(WARNING) - << "HIGH POWER MODE is not support, switch to little cores."; - if (threads > small_core_size) { - active_ids_ = little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(little_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - case LITE_POWER_RAND_LOW: - active_ids_.clear(); - if (small_core_size > 0) { - mode_ = LITE_POWER_RAND_LOW; - if (threads > small_core_size) { - LOG(WARNING) << "threads: " << threads - << ", exceed the little cores size: " << small_core_size; - active_ids_ = little_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back( - little_core_ids_[(i + shift_num) % small_core_size]); - } - } - } else { - mode_ = LITE_POWER_HIGH; - LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores."; - if (threads > big_core_size) { - active_ids_ = big_core_ids_; - } else { - for (int i = 0; i < threads; ++i) { - active_ids_.push_back(big_core_ids_[i]); - } - } - } - if (active_ids_.size() == 0) { - active_ids_.push_back(0); - } - break; - } - //! fix multi-threads LITE_POWER_HIGH mode - if (mode_ == LITE_POWER_NO_BIND || threads > 1) { - int threads = active_ids_.size(); - omp_set_num_threads(threads); - } else { - if (check_online(active_ids_)) { - BindDev(); - } else { - LOG(WARNING) << "core id " << active_ids_[0] - << " is offline, switch to NO BIND MODE"; - int threads = active_ids_.size(); - omp_set_num_threads(threads); - } + return cpu_num; +#elif defined(TARGET_IOS) + int cpu_num = 0; + size_t len = sizeof(cpu_num); + sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0); + if (cpu_num < 1) { + cpu_num = 1; } + return cpu_num; #else - if (big_core_size > 0) { - active_ids_ = {big_core_ids_[0]}; - } else { - active_ids_ = {0}; - } + return 1; #endif - //! alloc memory for sgemm in this context - int temp_mem_size = L2_cache_[active_ids_[0]] / sizeof(float); - workspace_.Resize({temp_mem_size}); - arch_ = archs_[active_ids_[0]]; -} - -bool DeviceInfo::ExtendWorkspace(DDimLite dims) { - auto count = dims.product(); - auto old = workspace_.dims(); - if (count == old.product()) { - return false; - } - - workspace_.Resize({static_cast( - count + L2_cache_[active_ids_[0]] / sizeof(float))}); - return true; -} - -// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 -void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - std::vector* cache; - switch (cache_id) { - case 0: - cache = &cpu_info->L1_cache_; - break; - case 1: - cache = &cpu_info->L2_cache_; - break; - case 2: - cache = &cpu_info->L3_cache_; - break; - default: - break; - } - int core_num = cpu_info->compute_core_num_; - cache->resize(core_num); - if (argc == 1) { - int cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < core_num; ++i) { - (*cache)[i] = cache_size; - } - } else { - int big_core_num = cpu_info->big_core_ids_.size(); - int little_core_num = cpu_info->little_core_ids_.size(); - int big_core_cache_size = va_arg(arg_ptr, int); - int little_core_cache_size = va_arg(arg_ptr, int); - for (int i = 0; i < big_core_num; ++i) { - (*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size; - } - for (int i = 0; i < little_core_num; ++i) { - (*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size; - } - } - va_end(arg_ptr); -} - -void set_arch_info(DeviceInfo* cpu_info, int argc, ...) { - va_list arg_ptr; - va_start(arg_ptr, argc); - int core_num = cpu_info->compute_core_num_; - cpu_info->archs_.resize(core_num); - if (argc == 1) { - ARMArch arch = (ARMArch)va_arg(arg_ptr, int); - for (int i = 0; i < core_num; ++i) { - cpu_info->archs_[i] = arch; - } - } else { - ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); - ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); - int big_core_num = cpu_info->big_core_ids_.size(); - int little_core_num = cpu_info->little_core_ids_.size(); - for (int i = 0; i < big_core_num; ++i) { - cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch; - } - for (int i = 0; i < little_core_num; ++i) { - cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch; - } - } - va_end(arg_ptr); } -bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) { - /* Snapdragon */ - if (hardware_name.find("SDM845") != std::string::npos) { // 845 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA75, kA55); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024); - set_cache_info(cpu_info, 2, 1, 2048 * 1024); - return true; - - } else if (hardware_name.find("SDM710") != std::string::npos) { // 710 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA75, kA55); - return true; - } else if (hardware_name.find("MSM8998") != std::string::npos) { // 835 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - set_cache_info(cpu_info, 0, 2, 64 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, - /*real cache size is 2M, while that will get bad performace - on conv3x3s1 or gemm, set to 1M or 512K*/ - 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8996") != std::string::npos) { // 820 - cpu_info->compute_core_num_ = 4; - cpu_info->core_ids_ = {0, 1, 2, 3}; - cpu_info->big_core_ids_ = {2, 3}; - cpu_info->little_core_ids_ = {0, 1}; - cpu_info->cluster_ids_ = {1, 1, 0, 0}; - set_arch_info(cpu_info, 1, kA72); - set_cache_info(cpu_info, 0, 1, 24 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("SDM660") != std::string::npos || - hardware_name.find("SDM636") != std::string::npos) { // 660, 636 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA73); - set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024); - set_cache_info(cpu_info, 1, 1, 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA72, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("MSM8953") != std::string::npos) { // 625 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 1, 1024 * 1024); - return true; - - } else if (hardware_name.find("MSM8939") != std::string::npos) { // 615 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3}; - cpu_info->little_core_ids_ = {4, 5, 6, 7}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; - set_arch_info(cpu_info, 1, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024); - return true; - - /* MediaTek */ - - } else if (hardware_name.find("MT6797") != - std::string::npos) { // X20/X23/X25/X27 - cpu_info->compute_core_num_ = 10; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - cpu_info->big_core_ids_ = {8, 9}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA72, kA53); - set_cache_info(cpu_info, 0, 1, 32 * 1024); - set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024); - return true; - - } else if (hardware_name.find("MT6799") != std::string::npos) { // X30 - cpu_info->compute_core_num_ = 10; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10}; - cpu_info->big_core_ids_ = {8, 9}; - cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - return true; - - } else if (hardware_name.find("MT6795") != std::string::npos || - hardware_name.find("MT6762") != std::string::npos || - hardware_name.find("MT6755T") != std::string::npos || - hardware_name.find("MT6755S") != std::string::npos || - hardware_name.find("MT6753") != std::string::npos || - hardware_name.find("MT6752") != std::string::npos || - hardware_name.find("MT6750") != std::string::npos) { - // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - - } else if (hardware_name.find("MT6758") != std::string::npos || - hardware_name.find("MT6757") != std::string::npos || - hardware_name.find("MT6763") != std::string::npos || - hardware_name.find("MT6755M") != std::string::npos || - hardware_name.find("MT6755") != - std::string::npos) { // P30, P20/P25, P23, P10 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - - } else if (hardware_name.find("MT6771") != std::string::npos) { // P60 - cpu_info->compute_core_num_ = 8; - cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; - cpu_info->big_core_ids_ = {4, 5, 6, 7}; - cpu_info->little_core_ids_ = {0, 1, 2, 3}; - cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; - set_arch_info(cpu_info, 2, kA73, kA53); - return true; - - } else if (hardware_name.find("MT6765") != std::string::npos || - hardware_name.find("MT6739") != std::string::npos || - hardware_name.find("MT6738") != std::string::npos || - hardware_name.find("MT6737") != - std::string::npos) { // A22, MT6739, MT6738, MT6767 - cpu_info->compute_core_num_ = 4; - cpu_info->core_ids_ = {0, 1, 2, 3}; - cpu_info->big_core_ids_ = {0, 0, 0, 0}; - cpu_info->little_core_ids_ = {}; - cpu_info->cluster_ids_ = {0, 0, 0, 0}; - set_arch_info(cpu_info, 1, kA53); - return true; - } - return false; -} - -size_t arm_get_meminfo() { +size_t get_mem_size() { #ifdef LITE_WITH_LINUX // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/meminfo", "rb"); if (!fp) { return 1; } - size_t memsize = 0; char line[1024]; while (!feof(fp)) { @@ -589,52 +96,18 @@ size_t arm_get_meminfo() { } sscanf(s, "MemTotal: %d kB", &memsize); } - fclose(fp); - return memsize; #elif defined(TARGET_IOS) // to be implemented printf("not implemented\n"); - return 0; -#endif -} - -int arm_get_cpucount() { -#ifdef LITE_WITH_LINUX - // get cpu count from /sys/devices/system/cpu/cpunum/uevent - int max_cpu_count = 20; - int count = 0; - for (int i = 0; i < max_cpu_count; ++i) { - char path[256]; - snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i); - FILE* fp = fopen(path, "rb"); - if (!fp) { - break; - } - count++; - fclose(fp); - } - if (count < 1) { - count = 1; - } - return count; -#elif defined(TARGET_IOS) - int count = 0; - size_t len = sizeof(count); - sysctlbyname("hw.ncpu", &count, &len, NULL, 0); - if (count < 1) { - count = 1; - } - return count; -#else - return 1; #endif + return 0; } -void arm_get_cpu_arch(std::vector* archs) { -#ifdef LITE_WITH_LINUX +void get_cpu_arch(std::vector* archs, const int cpu_num) { archs->clear(); +#ifdef LITE_WITH_LINUX //! get CPU ARCH FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) { @@ -668,6 +141,29 @@ void arm_get_cpu_arch(std::vector* archs) { case 0xd0a: archs->push_back(kA75); break; + case 0xd40: + archs->push_back(kA76); + break; + case 0x804: + // 855 + archs->push_back(kA76); + break; + case 0x805: + // 855 + archs->push_back(kA55); + break; + case 0x802: + // 845 + archs->push_back(kA75); + break; + case 0x803: + // 845 + archs->push_back(kA55); + break; + case 0x801: + // 835 + archs->push_back(kA73); + break; case 0x800: // 835 archs->push_back(kA73); @@ -677,49 +173,31 @@ void arm_get_cpu_arch(std::vector* archs) { archs->push_back(kA72); break; default: - LOG(ERROR) << "unknow type"; + LOG(ERROR) << "Unknow cpu arch: " << arch_id; archs->push_back(kARMArch_UNKOWN); } } } fclose(fp); - int cpu_count = arm_get_cpucount(); - if (archs->size() < cpu_count) { - for (int i = archs->size(); i < cpu_count; ++i) { + if (archs->size() < cpu_num) { + for (int i = archs->size(); i < cpu_num; ++i) { archs->push_back(archs->at(i - 1)); } } -#endif -#ifdef TARGET_IOS - int cpu_count = arm_get_cpucount(); - for (int i = 0; i < cpu_count; ++i) { +#elif defined(TARGET_IOS) + for (int i = 0; i < cpu_num; ++i) { archs->push_back(APPLE); } +#else + for (int i = 0; i < cpu_num; ++i) { + archs->push_back(kARMArch_UNKOWN); + } #endif } #ifdef LITE_WITH_LINUX -void set_default_cache(DeviceInfo* dev) { - int cpu_count = arm_get_cpucount(); - dev->L1_cache_.resize(cpu_count); - dev->L2_cache_.resize(cpu_count); - dev->L3_cache_.resize(cpu_count); -#ifdef TARGET_IOS - for (int i = 0; i < cpu_count; ++i) { - dev->L1_cache_[i] = 64 * 1024; - dev->L2_cache_[i] = 2048 * 1024; - dev->L3_cache_[i] = 0; - } -#else - for (int i = 0; i < cpu_count; ++i) { - dev->L1_cache_[i] = 32 * 1024; - dev->L2_cache_[i] = 512 * 1024; - dev->L3_cache_[i] = 0; - } -#endif -} -std::string arm_get_cpu_name() { +std::string get_cpu_name() { FILE* fp = fopen("/proc/cpuinfo", "rb"); if (!fp) { return ""; @@ -739,122 +217,163 @@ std::string arm_get_cpu_name() { return ""; } -int get_max_freq_khz(int cpuid) { +void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) { + *max_freq = 0; + *min_freq = 0; // first try, for all possible cpu char path[256]; snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid); - + "/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id); FILE* fp = fopen(path, "rb"); - if (!fp) { // second try, for online cpu snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state", - cpuid); + cpu_id); fp = fopen(path, "rb"); - if (!fp) { // third try, for online cpu + // get max_freq snprintf(path, sizeof(path), - "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid); + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", + cpu_id); fp = fopen(path, "rb"); - if (!fp) { - return -1; + return; } - - int max_freq_khz = -1; - fscanf(fp, "%d", &max_freq_khz); - + fscanf(fp, "%d", max_freq); fclose(fp); - - return max_freq_khz; + // get min_freq + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq", + cpu_id); + fp = fopen(path, "rb"); + if (!fp) { + return; + } + fscanf(fp, "%d", min_freq); + fclose(fp); + return; } } - - int max_freq_khz = 0; + *min_freq = std::numeric_limits::max(); while (!feof(fp)) { - int freq_khz = 0; - int nscan = fscanf(fp, "%d %*d", &freq_khz); + int freq = 0; + int nscan = fscanf(fp, "%d %*d", &freq); if (nscan != 1) { break; } - - if (freq_khz > max_freq_khz) { - max_freq_khz = freq_khz; + if (freq > *max_freq) { + *max_freq = freq; + } + if (freq < *min_freq) { + *min_freq = freq; } } - fclose(fp); - - return max_freq_khz; } -int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, - const std::vector& cpu_freq, - std::vector* cluster_ids) { - if (cpu_count == 0) { - return 0; +void sort_cpuid_by_max_freq(const std::vector& max_freqs, + std::vector* cpu_ids, + std::vector* cluster_ids) { + int cpu_num = max_freqs.size(); + if (cpu_num == 0) { + return; } - - cpuids->resize(cpu_count); - cluster_ids->resize(cpu_count); - - for (int i = 0; i < cpu_count; i++) { - cpuids->at(i) = i; + cpu_ids->resize(cpu_num); + cluster_ids->resize(cpu_num); + for (int i = 0; i < cpu_num; i++) { + cpu_ids->at(i) = i; } - // sort cpuid as big core first // simple bubble sort - - for (int i = 0; i < cpu_count; i++) { - for (int j = i + 1; j < cpu_count; j++) { - if (cpu_freq[i] < cpu_freq[j]) { + for (int i = 0; i < cpu_num; i++) { + for (int j = i + 1; j < cpu_num; j++) { + if (max_freqs[i] < max_freqs[j]) { // swap - int tmp = cpuids->at(i); - cpuids->at(i) = cpuids->at(j); - cpuids->at(j) = tmp; + int tmp = cpu_ids->at(i); + cpu_ids->at(i) = cpu_ids->at(j); + cpu_ids->at(j) = tmp; } } } // SMP - int mid_max_freq_khz = - (cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2; + int mid_max_freq = + (max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2; - for (int i = 0; i < cpu_count; i++) { - cpuids->at(i) = i; - if (cpu_freq[i] >= mid_max_freq_khz) { + for (int i = 0; i < cpu_num; i++) { + cpu_ids->at(i) = i; + if (max_freqs[i] >= mid_max_freq) { cluster_ids->at(i) = 0; } else { cluster_ids->at(i) = 1; } } - return 0; } -int check_online(const std::vector& core_ids) { - if (core_ids.size() == 0) { - return 0; +void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size, + int* l3_cache_size) { + int max_cache_idx_num = 10; + *l1_cache_size = DEFAULT_L1_CACHE_SIZE; + *l2_cache_size = DEFAULT_L2_CACHE_SIZE; + *l3_cache_size = DEFAULT_L3_CACHE_SIZE; + for (int i = 0; i < max_cache_idx_num; i++) { + char path[256]; + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, i); + FILE* fp = fopen(path, "rb"); + if (fp) { + int level = -1; + fscanf(fp, "%d", &level); + fclose(fp); + snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, i); + fp = fopen(path, "rb"); + if (fp) { + int size = -1; + fscanf(fp, "%d", &size); + fclose(fp); + if (size >= 0) { + if (level == 1) { + *l1_cache_size = size * 1024; + } else if (level == 2) { + *l2_cache_size = size * 1024; + } else if (level == 3) { + *l3_cache_size = size * 1024; + } + } + } + } + } +} + +bool check_cpu_online(const std::vector& cpu_ids) { + if (cpu_ids.size() == 0) { + return false; } char path[256]; - int online = 1; - for (int i = 0; i < core_ids.size(); ++i) { + bool all_online = true; + for (int i = 0; i < cpu_ids.size(); ++i) { snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online", - core_ids[i]); + cpu_ids[i]); FILE* fp = fopen(path, "rb"); - if (!fp) { - return 0; + int is_online = 0; + if (fp) { + fscanf(fp, "%d", &is_online); + fclose(fp); + } else { + LOG(ERROR) << "Failed to query the online statue of CPU id:" + << cpu_ids[i]; + } + if (is_online == 0) { + all_online = false; + LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine"; } - int cur_online = 0; - fscanf(fp, "%d", &cur_online); - online &= cur_online; - fclose(fp); } - return online; + return all_online; } -int set_sched_affinity(const std::vector& cpuids) { +int set_sched_affinity(const std::vector& cpu_ids) { // #define CPU_SETSIZE 1024 // #define __NCPUBITS (8 * sizeof (unsigned long)) // typedef struct @@ -870,20 +389,569 @@ int set_sched_affinity(const std::vector& cpuids) { #endif cpu_set_t mask; CPU_ZERO(&mask); - for (int i = 0; i < cpuids.size(); i++) { - CPU_SET(cpuids[i], &mask); + for (int i = 0; i < cpu_ids.size(); ++i) { + CPU_SET(cpu_ids[i], &mask); } - int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask); if (syscallret) { - LOG(ERROR) << "syscall error " << syscallret; return -1; } + return 0; +} + +bool bind_threads(const std::vector cpu_ids) { +#ifdef ARM_WITH_OMP + int thread_num = cpu_ids.size(); + omp_set_num_threads(thread_num); + std::vector ssarets; + for (int i = 0; i < thread_num; ++i) { + ssarets.push_back(0); + } +#pragma omp parallel for + for (int i = 0; i < thread_num; i++) { + ssarets[i] = set_sched_affinity(cpu_ids); + } + for (int i = 0; i < thread_num; i++) { + if (ssarets[i] != 0) { + LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i]; + return false; + } + } +#else // ARM_WITH_OMP + std::vector first_cpu_id; + first_cpu_id.push_back(cpu_ids[0]); + int ssaret = set_sched_affinity(first_cpu_id); + if (ssaret != 0) { + LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0]; + return false; + } +#endif // ARM_WITH_OMP +} + +#endif // LITE_WITH_LINUX + +// cache_id : 0 -> L1, 1 -> L2, 2 -> L3 +void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) { + va_list arg_ptr; + va_start(arg_ptr, argc); + std::vector* cache; + switch (cache_id) { + case 0: + cache = &L1_cache_; + break; + case 1: + cache = &L2_cache_; + break; + case 2: + cache = &L3_cache_; + break; + default: + break; + } + cache->resize(core_num_); + if (argc == 1) { + int cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < core_num_; ++i) { + (*cache)[i] = cache_size; + } + } else { + int big_core_num = big_core_ids_.size(); + int little_core_num = little_core_ids_.size(); + int big_core_cache_size = va_arg(arg_ptr, int); + int little_core_cache_size = va_arg(arg_ptr, int); + for (int i = 0; i < big_core_num; ++i) { + (*cache)[big_core_ids_[i]] = big_core_cache_size; + } + for (int i = 0; i < little_core_num; ++i) { + (*cache)[little_core_ids_[i]] = little_core_cache_size; + } + } + va_end(arg_ptr); +} + +void DeviceInfo::SetArchInfo(int argc, ...) { + va_list arg_ptr; + va_start(arg_ptr, argc); + archs_.resize(core_num_); + if (argc == 1) { + ARMArch arch = (ARMArch)va_arg(arg_ptr, int); + for (int i = 0; i < core_num_; ++i) { + archs_[i] = arch; + } + } else { + ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int); + ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int); + int big_core_num = big_core_ids_.size(); + int little_core_num = little_core_ids_.size(); + for (int i = 0; i < big_core_num; ++i) { + archs_[big_core_ids_[i]] = big_core_arch; + } + for (int i = 0; i < little_core_num; ++i) { + archs_[little_core_ids_[i]] = little_core_arch; + } + } + va_end(arg_ptr); +} + +bool DeviceInfo::SetCPUInfoByName() { + /* Snapdragon */ + if (dev_name_.find("SM8150") != std::string::npos) { // 855 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA76, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 2048 * 1024); + return true; + } else if (dev_name_.find("SDM845") != std::string::npos) { // 845 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA75, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 2048 * 1024); + return true; + } else if (dev_name_.find("SDM710") != std::string::npos) { // 710 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {6, 7}; + little_core_ids_ = {0, 1, 2, 3, 4, 5}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA75, kA55); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 256 * 1024, 128 * 1024); + SetCacheInfo(2, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8998") != std::string::npos) { // 835 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA73, kA53); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, + /*real cache size is 2M, while that will get bad performace + on conv3x3s1 or gemm, set to 1M or 512K*/ + 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8996") != std::string::npos) { // 820 + core_num_ = 4; + core_ids_ = {0, 1, 2, 3}; + big_core_ids_ = {2, 3}; + little_core_ids_ = {0, 1}; + cluster_ids_ = {1, 1, 0, 0}; + SetArchInfo(1, kA72); + SetCacheInfo(0, 1, 24 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("SDM660") != std::string::npos || + dev_name_.find("SDM636") != std::string::npos) { // 660, 636 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(1, kA73); + SetCacheInfo(0, 2, 64 * 1024, 32 * 1024); + SetCacheInfo(1, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8976") != std::string::npos) { // 652,653 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA72, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("MSM8953") != std::string::npos) { // 625 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 1, 1024 * 1024); + return true; + } else if (dev_name_.find("MSM8939") != std::string::npos) { // 615 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3}; + little_core_ids_ = {4, 5, 6, 7}; + cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1}; + SetArchInfo(1, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 512 * 1024, 256 * 1024); + return true; + /* MediaTek */ + } else if (dev_name_.find("MT6797") != + std::string::npos) { // X20/X23/X25/X27 + core_num_ = 10; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + big_core_ids_ = {8, 9}; + little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA72, kA53); + SetCacheInfo(0, 1, 32 * 1024); + SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024); + return true; + } else if (dev_name_.find("MT6799") != std::string::npos) { // X30 + core_num_ = 10; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + big_core_ids_ = {8, 9}; + little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + SetArchInfo(2, kA73, kA53); + return true; + } else if (dev_name_.find("MT6795") != std::string::npos || + dev_name_.find("MT6762") != std::string::npos || + dev_name_.find("MT6755T") != std::string::npos || + dev_name_.find("MT6755S") != std::string::npos || + dev_name_.find("MT6753") != std::string::npos || + dev_name_.find("MT6752") != std::string::npos || + dev_name_.find("MT6750") != std::string::npos) { + // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } else if (dev_name_.find("MT6758") != std::string::npos || + dev_name_.find("MT6757") != std::string::npos || + dev_name_.find("MT6763") != std::string::npos || + dev_name_.find("MT6755M") != std::string::npos || + dev_name_.find("MT6755") != + std::string::npos) { // P30, P20/P25, P23, P10 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } else if (dev_name_.find("MT6771") != std::string::npos) { // P60 + core_num_ = 8; + core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7}; + big_core_ids_ = {4, 5, 6, 7}; + little_core_ids_ = {0, 1, 2, 3}; + cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0}; + SetArchInfo(2, kA73, kA53); + return true; + } else if (dev_name_.find("MT6765") != std::string::npos || + dev_name_.find("MT6739") != std::string::npos || + dev_name_.find("MT6738") != std::string::npos || + dev_name_.find("MT6737") != + std::string::npos) { // A22, MT6739, MT6738, MT6767 + core_num_ = 4; + core_ids_ = {0, 1, 2, 3}; + big_core_ids_ = {0, 1, 2, 3}; + little_core_ids_ = {}; + cluster_ids_ = {0, 0, 0, 0}; + SetArchInfo(1, kA53); + return true; + } + return false; +} + +void DeviceInfo::SetCPUInfoByProb() { +#ifdef LITE_WITH_LINUX + // get big.LITTLE cores by sorting CPU frequency + sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_); + big_core_ids_.clear(); + little_core_ids_.clear(); + for (int i = 0; i < cluster_ids_.size(); ++i) { + if (cluster_ids_[i] == 0) { + big_core_ids_.push_back(core_ids_[i]); + } else { + little_core_ids_.push_back(core_ids_[i]); + } + } + // get l1, l2, l3 cache size for each core + for (int i = 0; i < core_num_; i++) { + get_cpu_cache_size(i, &(L1_cache_[i]), &(L2_cache_[i]), &(L3_cache_[i])); + } +#endif // LITE_WITH_LINUX +} + +void DeviceInfo::RequestPowerFullMode(const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + for (int i = 0; i < thread_num; ++i) { + if (i < big_core_size) { + active_ids_.push_back(big_core_ids_[i]); + } else if (i < big_core_size + little_core_size) { + active_ids_.push_back(little_core_ids_[i - big_core_size]); + } + } + mode_ = LITE_POWER_FULL; +} + +void DeviceInfo::RequestPowerHighMode(const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + if (big_core_size > 0) { + mode_ = LITE_POWER_HIGH; + if (thread_num > big_core_size) { + LOG(ERROR) << "Request thread num: " << thread_num + << ", exceed the big cores size: " << big_core_size + << ", truncate thread num to " << big_core_size; + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(big_core_ids_[i]); + } + } + } else { + mode_ = LITE_POWER_LOW; + LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores."; + if (thread_num > little_core_size) { + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(little_core_ids_[i]); + } + } + } +} + +void DeviceInfo::RequestPowerLowMode(const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + if (little_core_size > 0) { + mode_ = LITE_POWER_LOW; + if (thread_num > little_core_size) { + LOG(WARNING) << "Request thread num: " << thread_num + << ", exceed the little cores size: " << little_core_size + << ", truncate thread num to " << little_core_size; + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; i++) { + active_ids_.push_back(little_core_ids_[i]); + } + } + } else { + mode_ = LITE_POWER_HIGH; + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores"; + if (thread_num > big_core_size) { + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; i++) { + active_ids_.push_back(big_core_ids_[i]); + } + } + } +} +void DeviceInfo::RequestPowerNoBindMode(const int thread_num) { + active_ids_.clear(); + for (int i = 0; i < thread_num; i++) { + active_ids_.push_back(0); + } + mode_ = LITE_POWER_NO_BIND; +} + +void DeviceInfo::RequestPowerRandHighMode(const int shift_num, + const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + if (big_core_size > 0) { + mode_ = LITE_POWER_RAND_HIGH; + if (thread_num > big_core_size) { + LOG(WARNING) << "Request thread num: " << thread_num + << ", exceed the big cores size: " << big_core_size + << ", truncate thread num to " << big_core_size; + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]); + } + } + } else { + mode_ = LITE_POWER_LOW; + LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores."; + if (thread_num > little_core_size) { + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(little_core_ids_[i]); + } + } + } +} + +void DeviceInfo::RequestPowerRandLowMode(const int shift_num, + const int thread_num) { + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + active_ids_.clear(); + if (little_core_size > 0) { + mode_ = LITE_POWER_RAND_LOW; + if (thread_num > little_core_size) { + LOG(WARNING) << "Request thread num: " << thread_num + << ", exceed the little cores size: " << little_core_size + << ", truncate thread num to " << little_core_size; + active_ids_ = little_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back( + little_core_ids_[(i + shift_num) % little_core_size]); + } + } + } else { + mode_ = LITE_POWER_HIGH; + LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores."; + if (thread_num > big_core_size) { + active_ids_ = big_core_ids_; + } else { + for (int i = 0; i < thread_num; ++i) { + active_ids_.push_back(big_core_ids_[i]); + } + } + } +} + +int DeviceInfo::Setup() { + core_num_ = get_cpu_num(); + mem_size_ = get_mem_size(); + get_cpu_arch(&archs_, core_num_); + // set defalut CPU info + SetCacheInfo(0, DEFAULT_L1_CACHE_SIZE); + SetCacheInfo(1, DEFAULT_L2_CACHE_SIZE); + SetCacheInfo(2, DEFAULT_L3_CACHE_SIZE); +#ifdef LITE_WITH_LINUX + // get max&min freq + max_freqs_.resize(core_num_); + min_freqs_.resize(core_num_); + for (int i = 0; i < core_num_; ++i) { + int max_freq, min_freq; + get_cpu_max_min_freq(i, &max_freq, &min_freq); + max_freqs_[i] = max_freq / 1000; + min_freqs_[i] = min_freq / 1000; + } + // get cache size and big.LITTLE core ids + dev_name_ = get_cpu_name(); + if (!SetCPUInfoByName()) { + SetCPUInfoByProb(); + } + // output info + LOG(INFO) << "ARM multiprocessors name: " << dev_name_; + LOG(INFO) << "ARM multiprocessors number: " << core_num_; + for (int i = 0; i < core_num_; ++i) { + LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i] + << ", max freq: " << max_freqs_[i] + << ", min freq: " << min_freqs_[i] + << ", cluster ID: " << cluster_ids_[core_ids_[i]] + << ", CPU ARCH: A" << archs_[i]; + } + LOG(INFO) << "L1 DataCache size is: "; + for (int i = 0; i < core_num_; ++i) { + LOG(INFO) << L1_cache_[i] / 1024 << " KB"; + } + LOG(INFO) << "L2 Cache size is: "; + for (int i = 0; i < core_num_; ++i) { + LOG(INFO) << L2_cache_[i] / 1024 << " KB"; + } + LOG(INFO) << "Total memory: " << mem_size_ << "KB"; +#endif + // set default run mode + SetRunMode(LITE_POWER_NO_BIND, 1); // use single thread by default return 0; } +void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) { +#ifdef ARM_WITH_OMP + thread_num = std::min(thread_num, core_num_); +#else + thread_num = 1; // force thread_num to 1 if OpenMP is disabled +#endif +#ifdef LITE_WITH_LINUX + int big_core_size = big_core_ids_.size(); + int little_core_size = little_core_ids_.size(); + int big_little_core_size = big_core_size + little_core_size; + thread_num = std::min(thread_num, big_little_core_size); + count_++; + int shift_num = (count_ / 10) % big_core_size; + switch (mode) { + case LITE_POWER_FULL: + RequestPowerFullMode(thread_num); + break; + case LITE_POWER_HIGH: + RequestPowerHighMode(thread_num); + break; + case LITE_POWER_LOW: + RequestPowerLowMode(thread_num); + break; + case LITE_POWER_NO_BIND: + RequestPowerNoBindMode(thread_num); + break; + case LITE_POWER_RAND_HIGH: + RequestPowerRandHighMode(shift_num, thread_num); + break; + case LITE_POWER_RAND_LOW: + RequestPowerRandLowMode(shift_num, thread_num); + break; + default: + LOG(FATAL) << "Unsupported power mode: " << mode; + break; + } + if (active_ids_.size() == 0) { + active_ids_.push_back(0); + } +#ifdef ARM_WITH_OMP + omp_set_num_threads(active_ids_.size()); +#endif + if (mode_ != LITE_POWER_NO_BIND) { + if (check_cpu_online(active_ids_)) { + bind_threads(active_ids_); + } else { + LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE"; + mode_ = LITE_POWER_NO_BIND; + } + } +#else // LITE_WITH_LINUX + // only LITE_POWER_NO_BIND is supported in other OS + RequestPowerNoBindMode(thread_num); +#ifdef ARM_WITH_OMP + omp_set_num_threads(active_ids_.size()); +#endif #endif // LITE_WITH_LINUX + //! alloc memory for sgemm in this context + workspace_.Resize( + {static_cast(L2_cache_[active_ids_[0]] / sizeof(float))}); + arch_ = archs_[active_ids_[0]]; +} + +void DeviceInfo::SetCache(int l1size, int l2size, int l3size) { + SetCacheInfo(0, l1size); + SetCacheInfo(1, l2size); + SetCacheInfo(2, l3size); + workspace_.Resize({2 * (l1size + l2size)}); +} + +bool DeviceInfo::ExtendWorkspace(DDimLite dims) { + auto count = dims.product(); + auto old = workspace_.dims(); + if (count == old.product()) { + return false; + } + workspace_.Resize({static_cast( + count + L2_cache_[active_ids_[0]] / sizeof(float))}); + return true; +} #endif // LITE_WITH_ARM diff --git a/paddle/fluid/lite/core/cpu_info.h b/paddle/fluid/lite/core/cpu_info.h index b8c6ae95d..3c89d5eb4 100644 --- a/paddle/fluid/lite/core/cpu_info.h +++ b/paddle/fluid/lite/core/cpu_info.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include "paddle/fluid/lite/core/lite_tensor.h" @@ -47,92 +48,73 @@ typedef enum { class DeviceInfo { public: - int idx_; - int max_freq_; - int min_freq_; - int generate_arch_; - int compute_core_num_; - int max_memory_; - int sharemem_size_; - - std::string device_name_; - std::string compute_ability_; - - std::vector L1_cache_; - std::vector L2_cache_; - std::vector L3_cache_; - std::vector core_ids_; - std::vector big_core_ids_; - std::vector little_core_ids_; - std::vector cluster_ids_; - std::vector archs_; - - ARMArch arch_; - // LITE_POWER_HIGH stands for using big cores, - // LITE_POWER_LOW stands for using small core, - // LITE_POWER_FULL stands for using all cores - PowerMode mode_; - std::vector active_ids_; - TensorLite workspace_; - int64_t count_{0}; - static DeviceInfo& Global() { static auto* x = new DeviceInfo; return *x; } - static void Init() { - auto& info = Global(); - InitInternal(&info); + static int Init() { + static int ret = Global().Setup(); + return ret; } - void SetRunMode(PowerMode mode, int threads); + int Setup(); + + void SetRunMode(PowerMode mode, int thread_num); void SetCache(int l1size, int l2size, int l3size); void SetArch(ARMArch arch) { arch_ = arch; } - void BindDev(); PowerMode mode() const { return mode_; } int threads() const { return active_ids_.size(); } ARMArch arch() const { return arch_; } + int l1_cache_size() const { return L1_cache_[active_ids_[0]]; } + int l2_cache_size() const { return L2_cache_[active_ids_[0]]; } + int l3_cache_size() const { return L3_cache_[active_ids_[0]]; } template T* workspace_data() { return workspace_.mutable_data(); } - - int l1_cache_size() const { return L1_cache_[active_ids_[0]]; } - int l2_cache_size() const { return L2_cache_[active_ids_[0]]; } - int l3_cache_size() const { return L3_cache_[active_ids_[0]]; } bool ExtendWorkspace(DDimLite dims); private: - DeviceInfo() = default; - static void InitInternal(DeviceInfo* dev); -}; - -size_t arm_get_meminfo(); - -int arm_get_cpucount(); - -void arm_get_cpu_arch(std::vector* archs); - -bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name); + int core_num_; + std::vector max_freqs_; + std::vector min_freqs_; + int mem_size_; + std::string dev_name_; -#ifdef LITE_WITH_LINUX - -void set_default_cache(DeviceInfo* dev); - -std::string arm_get_cpu_name(); + std::vector L1_cache_; + std::vector L2_cache_; + std::vector L3_cache_; + std::vector core_ids_; + std::vector big_core_ids_; + std::vector little_core_ids_; + std::vector cluster_ids_; + std::vector archs_; -int get_max_freq_khz(int cpuid); + ARMArch arch_; + // LITE_POWER_HIGH stands for using big cores, + // LITE_POWER_LOW stands for using small core, + // LITE_POWER_FULL stands for using all cores + PowerMode mode_; + std::vector active_ids_; + TensorLite workspace_; + int64_t count_{0}; -int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector* cpuids, - const std::vector& cpu_freq, - std::vector* cluster_ids); -int check_online(const std::vector& core_ids); -int set_sched_affinity(const std::vector& cpuids); + void SetCacheInfo(int cache_id, int argc, ...); + void SetArchInfo(int argc, ...); + bool SetCPUInfoByName(); + void SetCPUInfoByProb(); + void RequestPowerFullMode(const int thread_num); + void RequestPowerHighMode(const int thread_num); + void RequestPowerLowMode(const int thread_num); + void RequestPowerNoBindMode(const int thread_num); + void RequestPowerRandHighMode(const int shift_num, const int thread_num); + void RequestPowerRandLowMode(const int shift_num, const int thread_num); -#endif // LITE_WITH_LINUX + DeviceInfo() = default; +}; #endif // LITE_WITH_ARM -- GitLab