提交 ad333ac5 编写于 作者: H Hong Ming

make all ARMContext share the same DeviceInfo, and export SetRunMode to allow users to set the thread num

make all ARMContext share the same DeviceInfo, and export SetRunMode to allow users to set the thread num
......@@ -29,9 +29,10 @@ double time_diff(Time t1, Time t2) {
return counter.count() / 1000.0;
}
void Run(const char* model_dir, int repeat) {
void Run(const char* model_dir, int repeat, int thread_num) {
#ifdef LITE_WITH_ARM
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
......@@ -67,8 +68,8 @@ void Run(const char* model_dir, int repeat) {
} // namespace paddle
// Entry point: runs the benchmark with a model directory, a repeat count and
// a thread count taken from the command line.
// NOTE(fix): the diff-merged text contained both the old (argc == 3) and the
// new (argc == 4) argument checks and both Run() calls back to back, which
// could never pass; this resolves to the new three-argument form.
int main(int argc, char** argv) {
  CHECK_EQ(argc, 4) << "usage: ./cmd <model_dir> <repeat> <thread_num>";
  paddle::lite::Run(argv[1], std::stoi(argv[2]), std::stoi(argv[3]));
  return 0;
}
......
......@@ -13,322 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/context.h"
#include "paddle/fluid/lite/core/cpu_info.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
// Overwrite the per-core cache sizes recorded in the global DeviceInfo and
// re-size this context's scratch workspace to 2 * (L1 + L2) bytes.
void Context<TargetType::kARM>::SetCache(int l1size, int l2size, int l3size) {
  auto& device_info = DeviceInfo::Global();
  const int core_count = arm_get_cpucount();
  // assign() resizes and fills every entry with the same value.
  device_info.L1_cache_.assign(core_count, l1size);
  device_info.L2_cache_.assign(core_count, l2size);
  device_info.L3_cache_.assign(core_count, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Default construction: run on core 0 in LITE_POWER_HIGH mode with a scratch
// workspace sized from that core's L2 cache (in floats).
Context<TargetType::kARM>::Context() {
  active_ids_ = {0};
  mode_ = LITE_POWER_HIGH;
  auto& device_info = DeviceInfo::Global();
  const int64_t l2_float_count = static_cast<int64_t>(
      device_info.L2_cache_[active_ids_[0]] / sizeof(float));
  workspace_.Resize({l2_float_count});
#ifdef TARGET_IOS
  arch_ = APPLE;  // use 6x8
#else
  // Tune for the first big core's micro-architecture when one is known.
  if (!device_info.big_core_ids_.empty()) {
    arch_ = device_info.archs_[device_info.big_core_ids_[0]];
  }
#endif
}
// Returns the power mode this context was last configured with.
PowerMode Context<TargetType::kARM>::mode() const { return mode_; }
// Returns the thread count, i.e. the number of active (bound) core ids.
int Context<TargetType::kARM>::threads() const { return active_ids_.size(); }
// Copy construction: reuse the copy-assignment operator, which duplicates
// every configuration field (mode, active core ids, workspace, arch, count).
Context<TargetType::kARM>::Context(const ARMContext& ctx) { *this = ctx; }
// Copy-assignment: duplicates the full run configuration of `ctx` into this
// context, including the scratch workspace tensor.
ARMContext& Context<TargetType::kARM>::operator=(const ARMContext& ctx) {
  mode_ = ctx.mode_;
  active_ids_ = ctx.active_ids_;
  workspace_ = ctx.workspace_;
  arch_ = ctx.arch_;
  count_ = ctx.count_;
  return *this;
}
// Bind the worker thread(s) to the cores listed in active_ids_.
// With OpenMP, every OMP worker restricts its own affinity to the active set;
// without OpenMP, only the current thread is pinned to the first active core.
// Logs an error and returns early if any affinity call fails.
void Context<TargetType::kARM>::BindDev() {
#ifdef ARM_WITH_OMP
  int num_threads = active_ids_.size();
  omp_set_num_threads(num_threads);
#ifdef LITE_WITH_LINUX
  std::vector<int> ssarets(num_threads, 0);
#pragma omp parallel for
  for (int i = 0; i < num_threads; i++) {
    ssarets[i] = set_sched_affinity(active_ids_);
  }
  for (int i = 0; i < num_threads; i++) {
    if (ssarets[i] != 0) {
      LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[i];
      return;
    }
  }
#endif  // LITE_WITH_LINUX
#else   // ARM_WITH_OMP
#ifdef LITE_WITH_LINUX
  std::vector<int> cpuid1;
  cpuid1.push_back(active_ids_[0]);
  int ssaret = set_sched_affinity(cpuid1);
  if (ssaret != 0) {
    // NOTE(fix): was printf(); use LOG(ERROR) for consistency with the
    // OpenMP branch above.
    LOG(ERROR) << "set cpu affinity failed, cpuID: " << active_ids_[0];
    return;
  }
#endif  // LITE_WITH_LINUX
#endif  // ARM_WITH_OMP
}
// Select the active cores and thread count for this context from the
// requested power mode:
//   LITE_POWER_HIGH / LOW        - big / little cluster only
//   LITE_POWER_FULL              - big cores first, then little cores
//   LITE_POWER_NO_BIND           - threads are not pinned to specific cores
//   LITE_POWER_RAND_HIGH / _LOW  - HIGH/LOW with a rotating start core
// When the requested cluster is empty it falls back to the other cluster
// (with a warning). Finally the workspace is re-sized from the first active
// core's L2 cache and arch_ is refreshed from that core.
void Context<TargetType::kARM>::SetRunMode(PowerMode mode, int threads) {
  DeviceInfo& dev = DeviceInfo::Global();
  int big_core_size = dev.big_core_ids_.size();
  int small_core_size = dev.little_core_ids_.size();
  // Clamp the request to the number of physical cores.
  if (threads > big_core_size + small_core_size) {
    threads = big_core_size + small_core_size;
  }
#ifdef ARM_WITH_OMP
  count_++;
  // Rotate the start core for the RAND_* modes every 10 calls.
  // NOTE(fix): guard the modulo -- big_core_size may be 0 and `% 0` is UB.
  int shift_num = big_core_size > 0 ? (count_ / 10) % big_core_size : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      mode_ = mode;
      active_ids_.clear();
      for (int i = 0; i < threads; ++i) {
        if (i < big_core_size) {
          active_ids_.push_back(dev.big_core_ids_[i]);
        } else {
          active_ids_.push_back(dev.little_core_ids_[i - big_core_size]);
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_HIGH;
        if (threads > big_core_size) {
          LOG(ERROR) << "threads: " << threads
                     << ", exceed the big cores size: " << big_core_size;
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      } else {
        // No big cluster: fall back to the little cores.
        mode_ = LITE_POWER_LOW;
        LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores";
        if (threads > small_core_size) {
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: "
                       << small_core_size;
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      } else {
        // No little cluster: fall back to the big cores.
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        if (threads > big_core_size) {
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_NO_BIND:
      mode_ = LITE_POWER_NO_BIND;
      active_ids_.clear();
      // NOTE(fix): cast to int to avoid a signed/unsigned comparison.
      if (threads > static_cast<int>(dev.core_ids_.size())) {
        active_ids_.resize(dev.core_ids_.size());
      } else {
        active_ids_.resize(threads);
      }
      break;
    case LITE_POWER_RAND_HIGH:
      active_ids_.clear();
      if (big_core_size > 0) {
        mode_ = LITE_POWER_RAND_HIGH;
        if (threads > big_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the big cores size: " << big_core_size;
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(
                dev.big_core_ids_[(i + shift_num) % big_core_size]);
          }
        }
      } else {
        mode_ = LITE_POWER_LOW;
        LOG(WARNING)
            << "HIGH POWER MODE is not support, switch to little cores";
        if (threads > small_core_size) {
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.little_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
    case LITE_POWER_RAND_LOW:
      active_ids_.clear();
      if (small_core_size > 0) {
        mode_ = LITE_POWER_RAND_LOW;
        if (threads > small_core_size) {
          LOG(WARNING) << "threads: " << threads
                       << ", exceed the little cores size: "
                       << small_core_size;
          active_ids_ = dev.little_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(
                dev.little_core_ids_[(i + shift_num) % small_core_size]);
          }
        }
      } else {
        mode_ = LITE_POWER_HIGH;
        LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
        if (threads > big_core_size) {
          active_ids_ = dev.big_core_ids_;
        } else {
          for (int i = 0; i < threads; ++i) {
            active_ids_.push_back(dev.big_core_ids_[i]);
          }
        }
      }
      if (active_ids_.size() == 0) {
        active_ids_.push_back(0);
      }
      break;
  }
  //! fix multi-threads LITE_POWER_HIGH mode
  if (mode_ == LITE_POWER_NO_BIND || threads > 1) {
    // NOTE(fix): renamed the local so it no longer shadows the parameter.
    int active_threads = active_ids_.size();
    omp_set_num_threads(active_threads);
  } else {
    if (check_online(active_ids_)) {
      BindDev();
    } else {
      LOG(ERROR) << "core id " << active_ids_[0]
                 << " is offline, switch to NO BIND MODE";
      int active_threads = active_ids_.size();
      omp_set_num_threads(active_threads);
    }
  }
#else
  // Without OpenMP only a single thread runs: pick the first big core.
  if (big_core_size > 0) {
    active_ids_ = {dev.big_core_ids_[0]};
  } else {
    active_ids_ = {0};
  }
#endif
  //! alloc memory for sgemm in this context
  int temp_mem_size =
      DeviceInfo::Global().L2_cache_[active_ids_[0]] / sizeof(float);
  workspace_.Resize({temp_mem_size});
  arch_ = DeviceInfo::Global().archs_[active_ids_[0]];
}
// Returns the CPU micro-architecture the kernels are currently tuned for.
ARMArch Context<TargetType::kARM>::arch() const { return arch_; }
// Overrides the detected micro-architecture.
void Context<TargetType::kARM>::SetArch(ARMArch arch) { arch_ = arch; }
// L1 data-cache size, in bytes, of the first active core.
int Context<TargetType::kARM>::l1_cache_size() const {
  return DeviceInfo::Global().L1_cache_[active_ids_[0]];
}
// L2 cache size, in bytes, of the first active core.
int Context<TargetType::kARM>::l2_cache_size() const {
  return DeviceInfo::Global().L2_cache_[active_ids_[0]];
}
// L3 cache size, in bytes, of the first active core.
int Context<TargetType::kARM>::l3_cache_size() const {
  return DeviceInfo::Global().L3_cache_[active_ids_[0]];
}
// Re-size the scratch workspace for a new element count. Returns false when
// the requested count already equals the current workspace size, true when
// the workspace was re-sized (with extra room of one L2 cache worth of
// floats beyond the request).
bool Context<TargetType::kARM>::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize(
      {static_cast<int64_t>(requested + l2_cache_size() / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
} // namespace lite
namespace lite {} // namespace lite
} // namespace paddle
......@@ -61,47 +61,41 @@ class Context<TargetType::kHost> {
template <>
class Context<TargetType::kARM> {
public:
Context();
Context(PowerMode mode, int threads);
Context() {}
explicit Context(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx);
ARMContext& operator=(const ARMContext& ctx) {}
// NOTE: InitOnce should only be used by ContextScheduler
void InitOnce() { DeviceInfo::Init(); }
void CopyShared(const ARMContext* ctx) {}
void SetRunMode(PowerMode mode, int threads);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch);
void BindDev();
void SetRunMode(PowerMode mode, int threads) {
return DeviceInfo::Global().SetRunMode(mode, threads);
}
void SetCache(int l1size, int l2size, int l3size) {
return DeviceInfo::Global().SetCache(l1size, l2size, l3size);
}
void SetArch(ARMArch arch) { return DeviceInfo::Global().SetArch(arch); }
PowerMode mode() const;
int threads() const;
ARMArch arch() const;
PowerMode mode() const { return DeviceInfo::Global().mode(); }
int threads() const { return DeviceInfo::Global().threads(); }
ARMArch arch() const { return DeviceInfo::Global().arch(); }
int l1_cache_size() const { return DeviceInfo::Global().l1_cache_size(); }
int l2_cache_size() const { return DeviceInfo::Global().l2_cache_size(); }
int l3_cache_size() const { return DeviceInfo::Global().l3_cache_size(); }
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
return DeviceInfo::Global().workspace_data<T>();
}
int l1_cache_size() const;
int l2_cache_size() const;
int l3_cache_size() const;
bool ExtendWorkspace(DDimLite dims);
bool ExtendWorkspace(DDimLite dims) {
return DeviceInfo::Global().ExtendWorkspace(dims);
}
std::string name() const { return "ARMContext"; }
private:
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
ARMArch arch_;
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
};
#endif
......
......@@ -12,312 +12,81 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
#ifdef ARM_WITH_OMP
#include <omp.h>
#endif
#include <algorithm>
#include <limits>
#include "paddle/fluid/lite/core/cpu_info.h"
#include <cstdarg>
namespace paddle {
namespace lite {
#ifdef LITE_WITH_ARM
void DeviceInfo::InitInternal(DeviceInfo* dev) {
set_default_cache(dev);
dev->compute_core_num_ = arm_get_cpucount();
dev->max_memory_ = arm_get_meminfo();
// get max freq
#ifdef LITE_WITH_LINUX
std::vector<int> max_freq(dev->compute_core_num_);
for (int i = 0; i < dev->compute_core_num_; ++i) {
max_freq[i] = get_max_freq_khz(i) / 1000;
}
std::string cpu_name = arm_get_cpu_name();
if (get_cpu_info_from_name(dev, cpu_name) != true) {
arm_sort_cpuid_by_max_frequency(dev->compute_core_num_, &dev->core_ids_,
max_freq, &dev->cluster_ids_);
dev->big_core_ids_.clear();
dev->little_core_ids_.clear();
for (int i = 0; i < dev->cluster_ids_.size(); ++i) {
if (dev->cluster_ids_[i] == 0) {
dev->big_core_ids_.push_back(dev->core_ids_[i]);
} else {
dev->little_core_ids_.push_back(dev->core_ids_[i]);
}
}
arm_get_cpu_arch(&dev->archs_);
}
LOG(INFO) << "ARM multiprocessors number: " << dev->compute_core_num_;
for (int i = 0; i < dev->compute_core_num_; ++i) {
LOG(INFO) << "ARM multiprocessors ID: " << dev->core_ids_[i]
<< ", frequence: " << max_freq[i]
<< ", cluster ID: " << dev->cluster_ids_[dev->core_ids_[i]]
<< ", CPU ARCH: A" << dev->archs_[i];
}
VLOG(1) << "L1 DataCache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L1_cache_[i] / 1024 << " KB";
}
VLOG(1) << "L2 Cache size is: ";
for (int i = 0; i < dev->compute_core_num_; ++i) {
VLOG(1) << dev->L2_cache_[i] / 1024 << " KB";
}
VLOG(1) << "Total memory: " << dev->max_memory_ << "KB";
dev->max_freq_ = max_freq[0];
for (int j = 1; j < dev->compute_core_num_; ++j) {
if (dev->max_freq_ < max_freq[j]) {
dev->max_freq_ = max_freq[j];
}
}
#elif defined(TARGET_IOS)
arm_get_cpu_arch(&dev->archs_);
#ifdef TARGET_IOS
const int DEFAULT_L1_CACHE_SIZE = 64 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 2048 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#else
const int DEFAULT_L1_CACHE_SIZE = 32 * 1024;
const int DEFAULT_L2_CACHE_SIZE = 512 * 1024;
const int DEFAULT_L3_CACHE_SIZE = 0;
#endif
}
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
void set_cache_info(DeviceInfo* cpu_info, int cache_id, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
std::vector<int>* cache;
switch (cache_id) {
case 0:
cache = &cpu_info->L1_cache_;
break;
case 1:
cache = &cpu_info->L2_cache_;
break;
case 2:
cache = &cpu_info->L3_cache_;
break;
default:
int get_cpu_num() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_num = 20;
int cpu_num = 0;
for (int i = 0; i < max_cpu_num; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
int core_num = cpu_info->compute_core_num_;
cache->resize(core_num);
if (argc == 1) {
int cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
(*cache)[i] = cache_size;
}
} else {
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
int big_core_cache_size = va_arg(arg_ptr, int);
int little_core_cache_size = va_arg(arg_ptr, int);
for (int i = 0; i < big_core_num; ++i) {
(*cache)[cpu_info->big_core_ids_[i]] = big_core_cache_size;
}
for (int i = 0; i < little_core_num; ++i) {
(*cache)[cpu_info->little_core_ids_[i]] = little_core_cache_size;
}
cpu_num++;
fclose(fp);
}
va_end(arg_ptr);
}
void set_arch_info(DeviceInfo* cpu_info, int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
int core_num = cpu_info->compute_core_num_;
cpu_info->archs_.resize(core_num);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num; ++i) {
cpu_info->archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = cpu_info->big_core_ids_.size();
int little_core_num = cpu_info->little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
cpu_info->archs_[cpu_info->big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
cpu_info->archs_[cpu_info->little_core_ids_[i]] = little_core_arch;
}
if (cpu_num < 1) {
cpu_num = 1;
}
va_end(arg_ptr);
}
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name) {
/* Snapdragon */
if (hardware_name.find("SDM845") != std::string::npos) { // 845
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 256 * 1024, 128 * 1024);
set_cache_info(cpu_info, 2, 1, 2048 * 1024);
return true;
} else if (hardware_name.find("SDM710") != std::string::npos) { // 710
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA75, kA55);
return true;
} else if (hardware_name.find("MSM8998") != std::string::npos) { // 835
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
set_cache_info(cpu_info, 0, 2, 64 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024,
/*real cache size is 2M, while that will get bad performace
on conv3x3s1 or gemm, set to 1M or 512K*/
1024 * 1024);
return true;
} else if (hardware_name.find("MSM8996") != std::string::npos) { // 820
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {2, 3};
cpu_info->little_core_ids_ = {0, 1};
cpu_info->cluster_ids_ = {1, 1, 0, 0};
set_arch_info(cpu_info, 1, kA72);
set_cache_info(cpu_info, 0, 1, 24 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("SDM660") != std::string::npos ||
hardware_name.find("SDM636") != std::string::npos) { // 660, 636
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA73);
set_cache_info(cpu_info, 0, 2, 64 * 1024, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8976") != std::string::npos) { // 652,653
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MSM8953") != std::string::npos) { // 625
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 1, 1024 * 1024);
return true;
} else if (hardware_name.find("MSM8939") != std::string::npos) { // 615
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3};
cpu_info->little_core_ids_ = {4, 5, 6, 7};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
set_arch_info(cpu_info, 1, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 512 * 1024, 256 * 1024);
return true;
/* MediaTek */
} else if (hardware_name.find("MT6797") !=
std::string::npos) { // X20/X23/X25/X27
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA72, kA53);
set_cache_info(cpu_info, 0, 1, 32 * 1024);
set_cache_info(cpu_info, 1, 2, 1024 * 1024, 512 * 1024);
return true;
} else if (hardware_name.find("MT6799") != std::string::npos) { // X30
cpu_info->compute_core_num_ = 10;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
cpu_info->big_core_ids_ = {8, 9};
cpu_info->little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6795") != std::string::npos ||
hardware_name.find("MT6762") != std::string::npos ||
hardware_name.find("MT6755T") != std::string::npos ||
hardware_name.find("MT6755S") != std::string::npos ||
hardware_name.find("MT6753") != std::string::npos ||
hardware_name.find("MT6752") != std::string::npos ||
hardware_name.find("MT6750") != std::string::npos) {
// X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6758") != std::string::npos ||
hardware_name.find("MT6757") != std::string::npos ||
hardware_name.find("MT6763") != std::string::npos ||
hardware_name.find("MT6755M") != std::string::npos ||
hardware_name.find("MT6755") !=
std::string::npos) { // P30, P20/P25, P23, P10
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
} else if (hardware_name.find("MT6771") != std::string::npos) { // P60
cpu_info->compute_core_num_ = 8;
cpu_info->core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
cpu_info->big_core_ids_ = {4, 5, 6, 7};
cpu_info->little_core_ids_ = {0, 1, 2, 3};
cpu_info->cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
set_arch_info(cpu_info, 2, kA73, kA53);
return true;
} else if (hardware_name.find("MT6765") != std::string::npos ||
hardware_name.find("MT6739") != std::string::npos ||
hardware_name.find("MT6738") != std::string::npos ||
hardware_name.find("MT6737") !=
std::string::npos) { // A22, MT6739, MT6738, MT6767
cpu_info->compute_core_num_ = 4;
cpu_info->core_ids_ = {0, 1, 2, 3};
cpu_info->big_core_ids_ = {0, 0, 0, 0};
cpu_info->little_core_ids_ = {};
cpu_info->cluster_ids_ = {0, 0, 0, 0};
set_arch_info(cpu_info, 1, kA53);
return true;
return cpu_num;
#elif defined(TARGET_IOS)
int cpu_num = 0;
size_t len = sizeof(cpu_num);
sysctlbyname("hw.ncpu", &cpu_num, &len, NULL, 0);
if (cpu_num < 1) {
cpu_num = 1;
}
return false;
return cpu_num;
#else
return 1;
#endif
}
size_t arm_get_meminfo() {
size_t get_mem_size() {
#ifdef LITE_WITH_LINUX
// read the total memory size (MemTotal) from /proc/meminfo
FILE* fp = fopen("/proc/meminfo", "rb");
if (!fp) {
return 1;
}
size_t memsize = 0;
char line[1024];
while (!feof(fp)) {
......@@ -327,52 +96,18 @@ size_t arm_get_meminfo() {
}
sscanf(s, "MemTotal: %d kB", &memsize);
}
fclose(fp);
return memsize;
#elif defined(TARGET_IOS)
// to be implemented
printf("not implemented\n");
return 0;
#endif
}
int arm_get_cpucount() {
#ifdef LITE_WITH_LINUX
// get cpu count from /sys/devices/system/cpu/cpunum/uevent
int max_cpu_count = 20;
int count = 0;
for (int i = 0; i < max_cpu_count; ++i) {
char path[256];
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/uevent", i);
FILE* fp = fopen(path, "rb");
if (!fp) {
break;
}
count++;
fclose(fp);
}
if (count < 1) {
count = 1;
}
return count;
#elif defined(TARGET_IOS)
int count = 0;
size_t len = sizeof(count);
sysctlbyname("hw.ncpu", &count, &len, NULL, 0);
if (count < 1) {
count = 1;
}
return count;
#else
return 1;
#endif
return 0;
}
void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
#ifdef LITE_WITH_LINUX
void get_cpu_arch(std::vector<ARMArch>* archs, const int cpu_num) {
archs->clear();
#ifdef LITE_WITH_LINUX
//! get CPU ARCH
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
......@@ -406,6 +141,29 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
case 0xd0a:
archs->push_back(kA75);
break;
case 0xd40:
archs->push_back(kA76);
break;
case 0x804:
// 855
archs->push_back(kA76);
break;
case 0x805:
// 855
archs->push_back(kA55);
break;
case 0x802:
// 845
archs->push_back(kA75);
break;
case 0x803:
// 845
archs->push_back(kA55);
break;
case 0x801:
// 835
archs->push_back(kA73);
break;
case 0x800:
// 835
archs->push_back(kA73);
......@@ -415,49 +173,31 @@ void arm_get_cpu_arch(std::vector<ARMArch>* archs) {
archs->push_back(kA72);
break;
default:
LOG(ERROR) << "unknow type";
LOG(ERROR) << "Unknow cpu arch: " << arch_id;
archs->push_back(kARMArch_UNKOWN);
}
}
}
fclose(fp);
int cpu_count = arm_get_cpucount();
if (archs->size() < cpu_count) {
for (int i = archs->size(); i < cpu_count; ++i) {
if (archs->size() < cpu_num) {
for (int i = archs->size(); i < cpu_num; ++i) {
archs->push_back(archs->at(i - 1));
}
}
#endif
#ifdef TARGET_IOS
int cpu_count = arm_get_cpucount();
for (int i = 0; i < cpu_count; ++i) {
#elif defined(TARGET_IOS)
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(APPLE);
}
#else
for (int i = 0; i < cpu_num; ++i) {
archs->push_back(kARMArch_UNKOWN);
}
#endif
}
#ifdef LITE_WITH_LINUX
// Fill the DeviceInfo cache tables with conservative per-core defaults
// (larger values on iOS targets, smaller ones elsewhere).
void set_default_cache(DeviceInfo* dev) {
  const int core_count = arm_get_cpucount();
#ifdef TARGET_IOS
  const int default_l1 = 64 * 1024;
  const int default_l2 = 2048 * 1024;
#else
  const int default_l1 = 32 * 1024;
  const int default_l2 = 512 * 1024;
#endif
  dev->L1_cache_.assign(core_count, default_l1);
  dev->L2_cache_.assign(core_count, default_l2);
  dev->L3_cache_.assign(core_count, 0);
}
std::string arm_get_cpu_name() {
std::string get_cpu_name() {
FILE* fp = fopen("/proc/cpuinfo", "rb");
if (!fp) {
return "";
......@@ -477,122 +217,163 @@ std::string arm_get_cpu_name() {
return "";
}
int get_max_freq_khz(int cpuid) {
void get_cpu_max_min_freq(int cpu_id, int* max_freq, int* min_freq) {
*max_freq = 0;
*min_freq = 0;
// first try, for all possible cpu
char path[256];
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpuid);
"/sys/devices/system/cpu/cpufreq/stats/cpu%d/time_in_state", cpu_id);
FILE* fp = fopen(path, "rb");
if (!fp) {
// second try, for online cpu
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/stats/time_in_state",
cpuid);
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
// third try, for online cpu
// get max_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq", cpuid);
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_max_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return -1;
return;
}
int max_freq_khz = -1;
fscanf(fp, "%d", &max_freq_khz);
fscanf(fp, "%d", max_freq);
fclose(fp);
return max_freq_khz;
// get min_freq
snprintf(path, sizeof(path),
"/sys/devices/system/cpu/cpu%d/cpufreq/cpuinfo_min_freq",
cpu_id);
fp = fopen(path, "rb");
if (!fp) {
return;
}
fscanf(fp, "%d", min_freq);
fclose(fp);
return;
}
}
int max_freq_khz = 0;
*min_freq = std::numeric_limits<int>::max();
while (!feof(fp)) {
int freq_khz = 0;
int nscan = fscanf(fp, "%d %*d", &freq_khz);
int freq = 0;
int nscan = fscanf(fp, "%d %*d", &freq);
if (nscan != 1) {
break;
}
if (freq_khz > max_freq_khz) {
max_freq_khz = freq_khz;
if (freq > *max_freq) {
*max_freq = freq;
}
if (freq < *min_freq) {
*min_freq = freq;
}
}
fclose(fp);
return max_freq_khz;
}
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids) {
if (cpu_count == 0) {
return 0;
void sort_cpuid_by_max_freq(const std::vector<int>& max_freqs,
std::vector<int>* cpu_ids,
std::vector<int>* cluster_ids) {
int cpu_num = max_freqs.size();
if (cpu_num == 0) {
return;
}
cpuids->resize(cpu_count);
cluster_ids->resize(cpu_count);
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
cpu_ids->resize(cpu_num);
cluster_ids->resize(cpu_num);
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
}
// sort cpuid as big core first
// simple bubble sort
for (int i = 0; i < cpu_count; i++) {
for (int j = i + 1; j < cpu_count; j++) {
if (cpu_freq[i] < cpu_freq[j]) {
for (int i = 0; i < cpu_num; i++) {
for (int j = i + 1; j < cpu_num; j++) {
if (max_freqs[i] < max_freqs[j]) {
// swap
int tmp = cpuids->at(i);
cpuids->at(i) = cpuids->at(j);
cpuids->at(j) = tmp;
int tmp = cpu_ids->at(i);
cpu_ids->at(i) = cpu_ids->at(j);
cpu_ids->at(j) = tmp;
}
}
}
// SMP
int mid_max_freq_khz =
(cpu_freq[cpuids->at(0)] + cpu_freq[cpuids->at(cpu_count - 1)]) / 2;
int mid_max_freq =
(max_freqs[cpu_ids->at(0)] + max_freqs[cpu_ids->at(cpu_num - 1)]) / 2;
for (int i = 0; i < cpu_count; i++) {
cpuids->at(i) = i;
if (cpu_freq[i] >= mid_max_freq_khz) {
for (int i = 0; i < cpu_num; i++) {
cpu_ids->at(i) = i;
if (max_freqs[i] >= mid_max_freq) {
cluster_ids->at(i) = 0;
} else {
cluster_ids->at(i) = 1;
}
}
return 0;
}
int check_online(const std::vector<int>& core_ids) {
if (core_ids.size() == 0) {
return 0;
// Read the cache hierarchy of core `cpu_id` from sysfs
// (/sys/devices/system/cpu/cpuN/cache/indexM/{level,size}). Any level that
// cannot be read keeps the platform default size.
void get_cpu_cache_size(int cpu_id, int* l1_cache_size, int* l2_cache_size,
                        int* l3_cache_size) {
  *l1_cache_size = DEFAULT_L1_CACHE_SIZE;
  *l2_cache_size = DEFAULT_L2_CACHE_SIZE;
  *l3_cache_size = DEFAULT_L3_CACHE_SIZE;
  const int max_cache_idx_num = 10;
  for (int idx = 0; idx < max_cache_idx_num; idx++) {
    char path[256];
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/level", cpu_id, idx);
    FILE* fp = fopen(path, "rb");
    if (!fp) {
      continue;  // this cache index does not exist
    }
    int level = -1;
    fscanf(fp, "%d", &level);
    fclose(fp);
    snprintf(path, sizeof(path),
             "/sys/devices/system/cpu/cpu%d/cache/index%d/size", cpu_id, idx);
    fp = fopen(path, "rb");
    if (!fp) {
      continue;
    }
    int size = -1;
    fscanf(fp, "%d", &size);
    fclose(fp);
    if (size < 0) {
      continue;  // unreadable size: keep the default for this level
    }
    // sysfs reports the size in KiB.
    if (level == 1) {
      *l1_cache_size = size * 1024;
    } else if (level == 2) {
      *l2_cache_size = size * 1024;
    } else if (level == 3) {
      *l3_cache_size = size * 1024;
    }
  }
}
bool check_cpu_online(const std::vector<int>& cpu_ids) {
if (cpu_ids.size() == 0) {
return false;
}
char path[256];
int online = 1;
for (int i = 0; i < core_ids.size(); ++i) {
bool all_online = true;
for (int i = 0; i < cpu_ids.size(); ++i) {
snprintf(path, sizeof(path), "/sys/devices/system/cpu/cpu%d/online",
core_ids[i]);
cpu_ids[i]);
FILE* fp = fopen(path, "rb");
if (!fp) {
return 0;
int is_online = 0;
if (fp) {
fscanf(fp, "%d", &is_online);
fclose(fp);
} else {
LOG(ERROR) << "Failed to query the online statue of CPU id:"
<< cpu_ids[i];
}
if (is_online == 0) {
all_online = false;
LOG(ERROR) << "CPU id:" << cpu_ids[i] << " is offine";
}
int cur_online = 0;
fscanf(fp, "%d", &cur_online);
online &= cur_online;
fclose(fp);
}
return online;
return all_online;
}
int set_sched_affinity(const std::vector<int>& cpuids) {
int set_sched_affinity(const std::vector<int>& cpu_ids) {
// #define CPU_SETSIZE 1024
// #define __NCPUBITS (8 * sizeof (unsigned long))
// typedef struct
......@@ -608,20 +389,569 @@ int set_sched_affinity(const std::vector<int>& cpuids) {
#endif
cpu_set_t mask;
CPU_ZERO(&mask);
for (int i = 0; i < cpuids.size(); i++) {
CPU_SET(cpuids[i], &mask);
for (int i = 0; i < cpu_ids.size(); ++i) {
CPU_SET(cpu_ids[i], &mask);
}
int syscallret = syscall(__NR_sched_setaffinity, pid, sizeof(mask), &mask);
if (syscallret) {
LOG(ERROR) << "syscall error " << syscallret;
return -1;
}
return 0;
}
// Pin the current worker thread(s) to `cpu_ids`. With OpenMP every worker is
// restricted to the whole set; without OpenMP only the calling thread is
// pinned to the first id. Returns true when every affinity call succeeds.
// NOTE(fix): the original fell off the end without a return statement, which
// is undefined behaviour for a function returning bool; the success path now
// returns true explicitly.
bool bind_threads(const std::vector<int> cpu_ids) {
#ifdef ARM_WITH_OMP
  int thread_num = cpu_ids.size();
  omp_set_num_threads(thread_num);
  std::vector<int> ssarets(thread_num, 0);
#pragma omp parallel for
  for (int i = 0; i < thread_num; i++) {
    ssarets[i] = set_sched_affinity(cpu_ids);
  }
  for (int i = 0; i < thread_num; i++) {
    if (ssarets[i] != 0) {
      LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[i];
      return false;
    }
  }
#else   // ARM_WITH_OMP
  std::vector<int> first_cpu_id;
  first_cpu_id.push_back(cpu_ids[0]);
  int ssaret = set_sched_affinity(first_cpu_id);
  if (ssaret != 0) {
    LOG(ERROR) << "Set cpu affinity failed, core id: " << cpu_ids[0];
    return false;
  }
#endif  // ARM_WITH_OMP
  return true;
}
#endif // LITE_WITH_LINUX
// cache_id : 0 -> L1, 1 -> L2, 2 -> L3
// Variadic: argc == 1 passes one size applied to every core; argc == 2
// passes (big-core size, little-core size) applied per cluster.
void DeviceInfo::SetCacheInfo(int cache_id, int argc, ...) {
  va_list arg_ptr;
  va_start(arg_ptr, argc);
  std::vector<int>* cache = nullptr;
  switch (cache_id) {
    case 0:
      cache = &L1_cache_;
      break;
    case 1:
      cache = &L2_cache_;
      break;
    case 2:
      cache = &L3_cache_;
      break;
    default:
      break;
  }
  if (cache == nullptr) {
    // NOTE(fix): the original left `cache` uninitialized on an unknown
    // cache_id and dereferenced it below; bail out instead.
    va_end(arg_ptr);
    return;
  }
  cache->resize(core_num_);
  if (argc == 1) {
    int cache_size = va_arg(arg_ptr, int);
    for (int i = 0; i < core_num_; ++i) {
      (*cache)[i] = cache_size;
    }
  } else {
    int big_core_num = big_core_ids_.size();
    int little_core_num = little_core_ids_.size();
    int big_core_cache_size = va_arg(arg_ptr, int);
    int little_core_cache_size = va_arg(arg_ptr, int);
    for (int i = 0; i < big_core_num; ++i) {
      (*cache)[big_core_ids_[i]] = big_core_cache_size;
    }
    for (int i = 0; i < little_core_num; ++i) {
      (*cache)[little_core_ids_[i]] = little_core_cache_size;
    }
  }
  va_end(arg_ptr);
}
void DeviceInfo::SetArchInfo(int argc, ...) {
va_list arg_ptr;
va_start(arg_ptr, argc);
archs_.resize(core_num_);
if (argc == 1) {
ARMArch arch = (ARMArch)va_arg(arg_ptr, int);
for (int i = 0; i < core_num_; ++i) {
archs_[i] = arch;
}
} else {
ARMArch big_core_arch = (ARMArch)va_arg(arg_ptr, int);
ARMArch little_core_arch = (ARMArch)va_arg(arg_ptr, int);
int big_core_num = big_core_ids_.size();
int little_core_num = little_core_ids_.size();
for (int i = 0; i < big_core_num; ++i) {
archs_[big_core_ids_[i]] = big_core_arch;
}
for (int i = 0; i < little_core_num; ++i) {
archs_[little_core_ids_[i]] = little_core_arch;
}
}
va_end(arg_ptr);
}
// Fill in CPU topology (core ids, big/little clusters), per-cluster
// micro-architecture and cache sizes from a hard-coded table keyed by the
// SoC name in dev_name_.  Returns true when the SoC is recognized; false
// tells the caller (Setup) to fall back to SetCPUInfoByProb().
// Note: in cluster_ids_, 0 marks a big core and 1 a little core.
// SetCacheInfo(cache_id, argc, ...): argc == 2 passes big-core size then
// little-core size; argc == 1 passes one size shared by all cores.
bool DeviceInfo::SetCPUInfoByName() {
  /* Snapdragon */
  if (dev_name_.find("SM8150") != std::string::npos) {  // 855
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA76, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM845") != std::string::npos) {  // 845
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 2048 * 1024);
    return true;
  } else if (dev_name_.find("SDM710") != std::string::npos) {  // 710
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {6, 7};
    little_core_ids_ = {0, 1, 2, 3, 4, 5};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA75, kA55);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 256 * 1024, 128 * 1024);
    SetCacheInfo(2, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8998") != std::string::npos) {  // 835
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024,
                 /*real cache size is 2M, while that will get bad performance
                    on conv3x3s1 or gemm, set to 1M or 512K*/
                 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8996") != std::string::npos) {  // 820
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {2, 3};
    little_core_ids_ = {0, 1};
    cluster_ids_ = {1, 1, 0, 0};
    SetArchInfo(1, kA72);
    SetCacheInfo(0, 1, 24 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("SDM660") != std::string::npos ||
             dev_name_.find("SDM636") != std::string::npos) {  // 660, 636
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA73);
    SetCacheInfo(0, 2, 64 * 1024, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8976") != std::string::npos) {  // 652,653
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MSM8953") != std::string::npos) {  // 625
    // Homogeneous octa-core: every core is treated as "big".
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 1, 1024 * 1024);
    return true;
  } else if (dev_name_.find("MSM8939") != std::string::npos) {  // 615
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {4, 5, 6, 7};
    cluster_ids_ = {0, 0, 0, 0, 1, 1, 1, 1};
    SetArchInfo(1, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 512 * 1024, 256 * 1024);
    return true;
    /* MediaTek */
  } else if (dev_name_.find("MT6797") !=
             std::string::npos) {  // X20/X23/X25/X27
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA72, kA53);
    SetCacheInfo(0, 1, 32 * 1024);
    SetCacheInfo(1, 2, 1024 * 1024, 512 * 1024);
    return true;
  } else if (dev_name_.find("MT6799") != std::string::npos) {  // X30
    // NOTE(review): no SetCacheInfo here — this SoC keeps the default cache
    // sizes set by Setup(); confirm that is intentional.
    core_num_ = 10;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9};
    big_core_ids_ = {8, 9};
    little_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    cluster_ids_ = {1, 1, 1, 1, 1, 1, 1, 1, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6795") != std::string::npos ||
             dev_name_.find("MT6762") != std::string::npos ||
             dev_name_.find("MT6755T") != std::string::npos ||
             dev_name_.find("MT6755S") != std::string::npos ||
             dev_name_.find("MT6753") != std::string::npos ||
             dev_name_.find("MT6752") != std::string::npos ||
             dev_name_.find("MT6750") != std::string::npos) {
    // X10, P22, P15/P18, MT6753, MT6752/MT6752M, MT6750
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6758") != std::string::npos ||
             dev_name_.find("MT6757") != std::string::npos ||
             dev_name_.find("MT6763") != std::string::npos ||
             dev_name_.find("MT6755M") != std::string::npos ||
             dev_name_.find("MT6755") !=
                 std::string::npos) {  // P30, P20/P25, P23, P10
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  } else if (dev_name_.find("MT6771") != std::string::npos) {  // P60
    core_num_ = 8;
    core_ids_ = {0, 1, 2, 3, 4, 5, 6, 7};
    big_core_ids_ = {4, 5, 6, 7};
    little_core_ids_ = {0, 1, 2, 3};
    cluster_ids_ = {1, 1, 1, 1, 0, 0, 0, 0};
    SetArchInfo(2, kA73, kA53);
    return true;
  } else if (dev_name_.find("MT6765") != std::string::npos ||
             dev_name_.find("MT6739") != std::string::npos ||
             dev_name_.find("MT6738") != std::string::npos ||
             dev_name_.find("MT6737") !=
                 std::string::npos) {  // A22, MT6739, MT6738, MT6767
    core_num_ = 4;
    core_ids_ = {0, 1, 2, 3};
    big_core_ids_ = {0, 1, 2, 3};
    little_core_ids_ = {};
    cluster_ids_ = {0, 0, 0, 0};
    SetArchInfo(1, kA53);
    return true;
  }
  // Unknown SoC: let the caller probe sysfs instead.
  return false;
}
// Probe the CPU topology at runtime (Linux sysfs) when the SoC name is not
// in the hard-coded table: classify big/little cores by max frequency and
// read each core's cache sizes.
void DeviceInfo::SetCPUInfoByProb() {
#ifdef LITE_WITH_LINUX
  // Sort cores by max frequency; cluster id 0 denotes the big cluster.
  sort_cpuid_by_max_freq(max_freqs_, &core_ids_, &cluster_ids_);
  big_core_ids_.clear();
  little_core_ids_.clear();
  const int cluster_count = cluster_ids_.size();
  for (int idx = 0; idx < cluster_count; ++idx) {
    auto& bucket =
        (cluster_ids_[idx] == 0) ? big_core_ids_ : little_core_ids_;
    bucket.push_back(core_ids_[idx]);
  }
  // Query the L1/L2/L3 data-cache sizes for every core.
  for (int core = 0; core < core_num_; ++core) {
    get_cpu_cache_size(core, &L1_cache_[core], &L2_cache_[core],
                       &L3_cache_[core]);
  }
#endif  // LITE_WITH_LINUX
}
void DeviceInfo::RequestPowerFullMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
for (int i = 0; i < thread_num; ++i) {
if (i < big_core_size) {
active_ids_.push_back(big_core_ids_[i]);
} else if (i < big_core_size + little_core_size) {
active_ids_.push_back(little_core_ids_[i - big_core_size]);
}
}
mode_ = LITE_POWER_FULL;
}
void DeviceInfo::RequestPowerHighMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (big_core_size > 0) {
mode_ = LITE_POWER_HIGH;
if (thread_num > big_core_size) {
LOG(ERROR) << "Request thread num: " << thread_num
<< ", exceed the big cores size: " << big_core_size
<< ", truncate thread num to " << big_core_size;
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_LOW;
LOG(ERROR) << "HIGH POWER MODE is not support, switch to little cores.";
if (thread_num > little_core_size) {
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(little_core_ids_[i]);
}
}
}
}
void DeviceInfo::RequestPowerLowMode(const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(little_core_ids_[i]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; i++) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// LITE_POWER_NO_BIND: no core affinity; active_ids_ holds thread_num
// placeholder entries (core id 0) so threads()/workspace sizing still work.
void DeviceInfo::RequestPowerNoBindMode(const int thread_num) {
  active_ids_.assign(thread_num, 0);
  mode_ = LITE_POWER_NO_BIND;
}
// LITE_POWER_RAND_HIGH: like HIGH mode but rotates the starting big core by
// shift_num to spread thermal load. Falls back to the little cluster when
// there are no big cores.
void DeviceInfo::RequestPowerRandHighMode(const int shift_num,
                                          const int thread_num) {
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  // Fix: clear stale bindings from a previous SetRunMode call. Every other
  // RequestPower*Mode clears active_ids_; without this the list keeps
  // growing across calls.
  active_ids_.clear();
  if (big_core_size > 0) {
    mode_ = LITE_POWER_RAND_HIGH;
    if (thread_num > big_core_size) {
      LOG(WARNING) << "Request thread num: " << thread_num
                   << ", exceed the big cores size: " << big_core_size
                   << ", truncate thread num to " << big_core_size;
      active_ids_ = big_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(big_core_ids_[(i + shift_num) % big_core_size]);
      }
    }
  } else {
    mode_ = LITE_POWER_LOW;
    LOG(WARNING) << "HIGH POWER MODE is not support, switch to little cores.";
    if (thread_num > little_core_size) {
      active_ids_ = little_core_ids_;
    } else {
      for (int i = 0; i < thread_num; ++i) {
        active_ids_.push_back(little_core_ids_[i]);
      }
    }
  }
}
void DeviceInfo::RequestPowerRandLowMode(const int shift_num,
const int thread_num) {
int big_core_size = big_core_ids_.size();
int little_core_size = little_core_ids_.size();
active_ids_.clear();
if (little_core_size > 0) {
mode_ = LITE_POWER_RAND_LOW;
if (thread_num > little_core_size) {
LOG(WARNING) << "Request thread num: " << thread_num
<< ", exceed the little cores size: " << little_core_size
<< ", truncate thread num to " << little_core_size;
active_ids_ = little_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(
little_core_ids_[(i + shift_num) % little_core_size]);
}
}
} else {
mode_ = LITE_POWER_HIGH;
LOG(WARNING) << "LOW POWER MODE is not support, switch to big cores.";
if (thread_num > big_core_size) {
active_ids_ = big_core_ids_;
} else {
for (int i = 0; i < thread_num; ++i) {
active_ids_.push_back(big_core_ids_[i]);
}
}
}
}
// One-time device probing: core count, architecture, frequencies, cache
// sizes and big/little topology. Called once via DeviceInfo::Init().
// Returns 0 on success.
int DeviceInfo::Setup() {
  core_num_ = get_cpu_num();
  mem_size_ = get_mem_size();
  get_cpu_arch(&archs_, core_num_);
  // Set default CPU cache info. Fix: SetCacheInfo(cache_id, argc, ...)
  // takes the count of variadic size arguments as `argc`; previously the
  // cache size itself was passed as argc, so the vararg reader consumed
  // two arguments that were never supplied (undefined behavior).
  SetCacheInfo(0, 1, DEFAULT_L1_CACHE_SIZE);
  SetCacheInfo(1, 1, DEFAULT_L2_CACHE_SIZE);
  SetCacheInfo(2, 1, DEFAULT_L3_CACHE_SIZE);
#ifdef LITE_WITH_LINUX
  // Read each core's max/min frequency (converted from kHz to MHz).
  max_freqs_.resize(core_num_);
  min_freqs_.resize(core_num_);
  for (int i = 0; i < core_num_; ++i) {
    int max_freq, min_freq;
    get_cpu_max_min_freq(i, &max_freq, &min_freq);
    max_freqs_[i] = max_freq / 1000;
    min_freqs_[i] = min_freq / 1000;
  }
  // Prefer the hard-coded SoC table; fall back to sysfs probing.
  dev_name_ = get_cpu_name();
  if (!SetCPUInfoByName()) {
    SetCPUInfoByProb();
  }
  // Log the detected configuration.
  LOG(INFO) << "ARM multiprocessors name: " << dev_name_;
  LOG(INFO) << "ARM multiprocessors number: " << core_num_;
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << "ARM multiprocessors ID: " << core_ids_[i]
              << ", max freq: " << max_freqs_[i]
              << ", min freq: " << min_freqs_[i]
              << ", cluster ID: " << cluster_ids_[core_ids_[i]]
              << ", CPU ARCH: A" << archs_[i];
  }
  LOG(INFO) << "L1 DataCache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L1_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "L2 Cache size is: ";
  for (int i = 0; i < core_num_; ++i) {
    LOG(INFO) << L2_cache_[i] / 1024 << " KB";
  }
  LOG(INFO) << "Total memory: " << mem_size_ << "KB";
#endif
  // Default run mode: no binding, single thread.
  SetRunMode(LITE_POWER_NO_BIND, 1);  // use single thread by default
  return 0;
}
// Select the power mode and thread count, bind threads to cores (Linux),
// configure OpenMP, and size the shared workspace for the chosen cores.
// thread_num is clamped to the available core count; without OpenMP it is
// forced to 1.
void DeviceInfo::SetRunMode(PowerMode mode, int thread_num) {
#ifdef ARM_WITH_OMP
  thread_num = std::min(thread_num, core_num_);
#else
  thread_num = 1;  // force thread_num to 1 if OpenMP is disabled
#endif
#ifdef LITE_WITH_LINUX
  int big_core_size = big_core_ids_.size();
  int little_core_size = little_core_ids_.size();
  int big_little_core_size = big_core_size + little_core_size;
  thread_num = std::min(thread_num, big_little_core_size);
  count_++;
  // Rotate rand-mode bindings every 10 calls. Fix: guard the modulo —
  // big_core_size is 0 on little-only topologies (and on homogeneous SoCs
  // probed as a single cluster), which made this divide by zero.
  int shift_num =
      (big_core_size > 0) ? static_cast<int>((count_ / 10) % big_core_size)
                          : 0;
  switch (mode) {
    case LITE_POWER_FULL:
      RequestPowerFullMode(thread_num);
      break;
    case LITE_POWER_HIGH:
      RequestPowerHighMode(thread_num);
      break;
    case LITE_POWER_LOW:
      RequestPowerLowMode(thread_num);
      break;
    case LITE_POWER_NO_BIND:
      RequestPowerNoBindMode(thread_num);
      break;
    case LITE_POWER_RAND_HIGH:
      RequestPowerRandHighMode(shift_num, thread_num);
      break;
    case LITE_POWER_RAND_LOW:
      RequestPowerRandLowMode(shift_num, thread_num);
      break;
    default:
      LOG(FATAL) << "Unsupported power mode: " << mode;
      break;
  }
  // Always keep at least one entry so the cache/arch lookups below are safe.
  if (active_ids_.size() == 0) {
    active_ids_.push_back(0);
  }
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
  if (mode_ != LITE_POWER_NO_BIND) {
    if (check_cpu_online(active_ids_)) {
      bind_threads(active_ids_);
    } else {
      LOG(WARNING) << "Some cores are offline, switch to NO BIND MODE";
      mode_ = LITE_POWER_NO_BIND;
    }
  }
#else   // LITE_WITH_LINUX
  // only LITE_POWER_NO_BIND is supported in other OS
  RequestPowerNoBindMode(thread_num);
#ifdef ARM_WITH_OMP
  omp_set_num_threads(active_ids_.size());
#endif
#endif  // LITE_WITH_LINUX
  //! alloc memory for sgemm in this context
  workspace_.Resize(
      {static_cast<int64_t>(L2_cache_[active_ids_[0]] / sizeof(float))});
  arch_ = archs_[active_ids_[0]];
}
// Override the detected L1/L2/L3 cache sizes (same value for every core)
// and resize the shared workspace accordingly.
void DeviceInfo::SetCache(int l1size, int l2size, int l3size) {
  // Fix: SetCacheInfo(cache_id, argc, ...) takes the count of variadic size
  // arguments as `argc`; the sizes were previously passed in the argc slot,
  // which made va_arg read arguments that were never supplied (UB).
  SetCacheInfo(0, 1, l1size);
  SetCacheInfo(1, 1, l2size);
  SetCacheInfo(2, 1, l3size);
  workspace_.Resize({2 * (l1size + l2size)});
}
// Grow the shared workspace to hold `dims` plus one L2-cache worth of
// floats for the active core. Returns false (no resize) only when the
// requested element count equals the current workspace size exactly.
bool DeviceInfo::ExtendWorkspace(DDimLite dims) {
  const auto requested = dims.product();
  if (requested == workspace_.dims().product()) {
    return false;
  }
  workspace_.Resize({static_cast<int64_t>(
      requested + L2_cache_[active_ids_[0]] / sizeof(float))});
  return true;
}
#endif // LITE_WITH_ARM
......
......@@ -14,24 +14,12 @@
#pragma once
#include <cstdarg>
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/lite_tensor.h"
#include "paddle/fluid/lite/utils/cp_logging.h"
#ifdef LITE_WITH_LINUX
#include <sys/syscall.h>
#include <unistd.h>
#endif
#if __APPLE__
#include "TargetConditionals.h"
#if TARGET_OS_IPHONE
#include <mach/machine.h>
#include <sys/sysctl.h>
#include <sys/types.h>
#endif // TARGET_OS_IPHONE
#endif // __APPLE__
namespace paddle {
namespace lite {
......@@ -60,64 +48,73 @@ typedef enum {
class DeviceInfo {
public:
int idx_;
int max_freq_;
int min_freq_;
int generate_arch_;
int compute_core_num_;
int max_memory_;
int sharemem_size_;
std::string device_name_;
std::string compute_ability_;
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
static DeviceInfo& Global() {
static auto* x = new DeviceInfo;
return *x;
}
static void Init() {
auto& info = Global();
InitInternal(&info);
static int Init() {
static int ret = Global().Setup();
return ret;
}
private:
DeviceInfo() = default;
static void InitInternal(DeviceInfo* dev);
};
int Setup();
size_t arm_get_meminfo();
void SetRunMode(PowerMode mode, int thread_num);
void SetCache(int l1size, int l2size, int l3size);
void SetArch(ARMArch arch) { arch_ = arch; }
int arm_get_cpucount();
PowerMode mode() const { return mode_; }
int threads() const { return active_ids_.size(); }
ARMArch arch() const { return arch_; }
int l1_cache_size() const { return L1_cache_[active_ids_[0]]; }
int l2_cache_size() const { return L2_cache_[active_ids_[0]]; }
int l3_cache_size() const { return L3_cache_[active_ids_[0]]; }
void arm_get_cpu_arch(std::vector<ARMArch>* archs);
bool get_cpu_info_from_name(DeviceInfo* cpu_info, std::string hardware_name);
#ifdef LITE_WITH_LINUX
void set_default_cache(DeviceInfo* dev);
template <typename T>
T* workspace_data() {
return workspace_.mutable_data<T>();
}
bool ExtendWorkspace(DDimLite dims);
std::string arm_get_cpu_name();
private:
int core_num_;
std::vector<int> max_freqs_;
std::vector<int> min_freqs_;
int mem_size_;
std::string dev_name_;
int get_max_freq_khz(int cpuid);
std::vector<int> L1_cache_;
std::vector<int> L2_cache_;
std::vector<int> L3_cache_;
std::vector<int> core_ids_;
std::vector<int> big_core_ids_;
std::vector<int> little_core_ids_;
std::vector<int> cluster_ids_;
std::vector<ARMArch> archs_;
int arm_sort_cpuid_by_max_frequency(int cpu_count, std::vector<int>* cpuids,
const std::vector<int>& cpu_freq,
std::vector<int>* cluster_ids);
int check_online(const std::vector<int>& core_ids);
int set_sched_affinity(const std::vector<int>& cpuids);
ARMArch arch_;
// LITE_POWER_HIGH stands for using big cores,
// LITE_POWER_LOW stands for using small core,
// LITE_POWER_FULL stands for using all cores
PowerMode mode_;
std::vector<int> active_ids_;
TensorLite workspace_;
int64_t count_{0};
void SetCacheInfo(int cache_id, int argc, ...);
void SetArchInfo(int argc, ...);
bool SetCPUInfoByName();
void SetCPUInfoByProb();
void RequestPowerFullMode(const int thread_num);
void RequestPowerHighMode(const int thread_num);
void RequestPowerLowMode(const int thread_num);
void RequestPowerNoBindMode(const int thread_num);
void RequestPowerRandHighMode(const int shift_num, const int thread_num);
void RequestPowerRandLowMode(const int shift_num, const int thread_num);
#endif // LITE_WITH_LINUX
DeviceInfo() = default;
};
#endif // LITE_WITH_ARM
......
......@@ -28,8 +28,6 @@ void ConvCompute::PrepareForRun() {
auto o_dims = param.output->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
// TODO(xxx): make api and expose it
ctx.SetRunMode(LITE_POWER_HIGH, 4);
int win = x_dims[3]; // nchw
int hin = x_dims[2];
......
......@@ -28,7 +28,6 @@ void FcCompute::PrepareForRun() {
auto w_dims = param.w->dims();
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
CHECK_GE(x_dims.size(), 2UL);
CHECK_EQ(w_dims.size(), 2UL);
......
......@@ -24,7 +24,6 @@ namespace arm {
void MulCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void MulCompute::Run() {
......
......@@ -26,7 +26,6 @@ namespace arm {
void PoolCompute::PrepareForRun() {
auto& ctx = this->ctx_->template As<ARMContext>();
ctx.SetRunMode(LITE_POWER_HIGH, 4);
}
void PoolCompute::Run() {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册