diff --git a/mace/benchmark/benchmark_model.cc b/mace/benchmark/benchmark_model.cc index e58de0d317869944b13ba597e7f691f63d6bcda3..e5de57b7ddfe414579d961268a08ca1d35b2e549 100644 --- a/mace/benchmark/benchmark_model.cc +++ b/mace/benchmark/benchmark_model.cc @@ -215,11 +215,11 @@ DEFINE_bool(show_flops, true, "whether to estimate the model's FLOPs"); DEFINE_int32(warmup_runs, 1, "how many runs to initialize model"); DEFINE_string(model_data_file, "", "model data file name, used when EMBED_MODEL_DATA set to 0"); -DEFINE_int32(gpu_perf_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(gpu_priority_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(omp_num_threads, 4, "num of openmp threads"); -DEFINE_int32(cpu_power_option, 0, - "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); +DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); +DEFINE_int32(cpu_affinity_policy, 1, + "0:AFFINITY_DEFAULT/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); int Main(int argc, char **argv) { MACE_CHECK(FLAGS_device != "HEXAGON", @@ -232,7 +232,7 @@ int Main(int argc, char **argv) { LOG(INFO) << "gpu_perf_hint: [" << FLAGS_gpu_perf_hint << "]"; LOG(INFO) << "gpu_priority_hint: [" << FLAGS_gpu_priority_hint << "]"; LOG(INFO) << "omp_num_threads: [" << FLAGS_omp_num_threads << "]"; - LOG(INFO) << "cpu_power_option: [" << FLAGS_cpu_power_option << "]"; + LOG(INFO) << "cpu_affinity_policy: [" << FLAGS_cpu_affinity_policy << "]"; LOG(INFO) << "Input node: [" << FLAGS_input_node<< "]"; LOG(INFO) << "Input shapes: [" << FLAGS_input_shape << "]"; LOG(INFO) << "Output node: [" << FLAGS_output_node<< "]"; @@ -267,11 +267,11 @@ int Main(int argc, char **argv) { mace::DeviceType device_type = ParseDeviceType(FLAGS_device); // config runtime - mace::ConfigOmpThreads(FLAGS_omp_num_threads); - mace::ConfigCPUPowerOption( - static_cast(FLAGS_cpu_power_option)); - if (device_type == OPENCL) { - mace::ConfigOpenCLRuntime( + mace::SetOpenMPThreadPolicy( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_affinity_policy)); + if (device_type == DeviceType::OPENCL) { + mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); } diff --git a/mace/core/mace_runtime.cc b/mace/core/mace_runtime.cc index 5f94195370990480c0003836cbde8d2a529c70c3..4e7c835cd5056ac837ea6834df884e0b360c3557 100644 --- a/mace/core/mace_runtime.cc +++ b/mace/core/mace_runtime.cc @@ -10,26 +10,31 @@ namespace mace { std::shared_ptr kStorageFactory = nullptr; -void ConfigOpenCLRuntime(GPUPerfHint gpu_perf_hint, - GPUPriorityHint gpu_priority_hint) { +void SetGPUHints(GPUPerfHint gpu_perf_hint, GPUPriorityHint gpu_priority_hint) { VLOG(1) << "Set GPU configurations, gpu_perf_hint: " << gpu_perf_hint << ", gpu_priority_hint: " << gpu_priority_hint; OpenCLRuntime::Configure(gpu_perf_hint, gpu_priority_hint); } -void ConfigKVStorageFactory(std::shared_ptr storage_factory) { +void SetKVStorageFactory(std::shared_ptr storage_factory) { VLOG(1) << "Set internal KV Storage Engine"; kStorageFactory = storage_factory; } -void ConfigOmpThreads(int omp_num_threads) { - VLOG(1) << "Config CPU omp_num_threads: " << omp_num_threads; - SetOmpThreads(omp_num_threads); +MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, + CPUAffinityPolicy policy) { + VLOG(1) << "Set CPU openmp num_threads_hint: " << num_threads_hint + << ", affinity policy: " << policy; + return SetOpenMPThreadsAndAffinityPolicy(num_threads_hint, policy); } -void ConfigCPUPowerOption(CPUPowerOption power_option) { - VLOG(1) << "Config CPU power option" << power_option; - SetThreadsAffinity(power_option); +void SetOpenMPThreadAffinity(int num_threads, const std::vector &cpu_ids) { + return SetOpenMPThreadsAndAffinityCPUs(num_threads, cpu_ids); +} + +MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, + std::vector *little_core_ids) { + return GetCPUBigLittleCoreIDs(big_core_ids, little_core_ids); } }; // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 5a1538b0c995f585cc67e52bc5558db02e42410a..b1bd9fe932b5e27204ef62881a5fd94d953a0583 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -2,12 +2,14 @@ // Copyright (c) 2017 XiaoMi All rights reserved. // +#include "mace/core/runtime/cpu/cpu_runtime.h" + #include -#include #include +#include +#include #include -#include "mace/core/runtime/cpu/cpu_runtime.h" #include "mace/public/mace.h" #include "mace/utils/logging.h" namespace mace { @@ -76,54 +78,98 @@ void SetThreadAffinity(cpu_set_t mask) { } // namespace -void SetOmpThreads(int omp_num_threads) { +MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, + std::vector *little_core_ids) { + MACE_CHECK_NOTNULL(big_core_ids); + MACE_CHECK_NOTNULL(little_core_ids); int cpu_count = omp_get_num_procs(); - if (omp_num_threads > cpu_count) { - LOG(WARNING) << "set omp num threads greater than num of cpus can use: " - << cpu_count; - } - omp_set_num_threads(omp_num_threads); -} + std::vector cpu_max_freq(cpu_count); + std::vector cpu_ids(cpu_count); -void SetThreadsAffinity(CPUPowerOption power_option) { - // There is no need to set affinity in default mode - if (power_option == CPUPowerOption::DEFAULT) { - return; + // set cpu max frequency + for (int i = 0; i < cpu_count; ++i) { + cpu_max_freq[i] = GetCPUMaxFreq(i); + if (cpu_max_freq[i] == 0) { + LOG(WARNING) << "Cannot get cpu" << i + << "'s max frequency info, maybe it is offline."; + return MACE_INVALID_ARGS; + } + cpu_ids[i] = i; } - int cpu_count = omp_get_num_procs(); - std::vector sorted_cpu_ids; - sorted_cpu_ids.resize(cpu_count); - int big_core_offset; - SortCPUIdsByMaxFreqAsc(&sorted_cpu_ids, &big_core_offset); + // sort cpu ids by max frequency asc + std::sort(cpu_ids.begin(), cpu_ids.end(), + [&cpu_max_freq](int a, int b) { + return cpu_max_freq[a] < cpu_max_freq[b]; + }); - std::vector use_cpu_ids; - if (power_option == CPUPowerOption::HIGH_PERFORMANCE) { - use_cpu_ids = std::vector(sorted_cpu_ids.begin() + big_core_offset, - sorted_cpu_ids.end()); - } else { - if (big_core_offset > 0) { - use_cpu_ids = std::vector(sorted_cpu_ids.begin(), - sorted_cpu_ids.begin() + big_core_offset); - } else { - use_cpu_ids = sorted_cpu_ids; + big_core_ids->reserve(cpu_count); + little_core_ids->reserve(cpu_count); + int little_core_freq = cpu_max_freq.front(); + int big_core_freq = cpu_max_freq.back(); + for (int i = 0; i < cpu_count; ++i) { + if (cpu_max_freq[i] == little_core_freq) { + little_core_ids->push_back(cpu_ids[i]); + } + if (cpu_max_freq[i] == big_core_freq) { + big_core_ids->push_back(cpu_ids[i]); } } + return MACE_SUCCESS; +} + +void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, + const std::vector &cpu_ids) { + std::ostringstream oss; + for (auto cpu_id : cpu_ids) oss << cpu_id << ' '; + VLOG(1) << "Set CPU openmp num_threads: " << omp_num_threads + << ", cpu_ids: " << oss.str(); + + omp_set_num_threads(omp_num_threads); + // compute mask cpu_set_t mask; CPU_ZERO(&mask); - for (auto cpu_id : use_cpu_ids) { + for (auto cpu_id : cpu_ids) { CPU_SET(cpu_id, &mask); } VLOG(3) << "Set cpu affinity with mask: " << mask.__bits[0]; - int omp_num_threads = omp_get_max_threads(); #pragma omp parallel for for (int i = 0; i < omp_num_threads; ++i) { SetThreadAffinity(mask); } } +MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, + CPUAffinityPolicy policy) { + // There is no need to set affinity in default mode + if (policy == CPUAffinityPolicy::AFFINITY_DEFAULT) { + if (omp_num_threads_hint > 0) omp_set_num_threads(omp_num_threads_hint); + return MACE_SUCCESS; + } + + std::vector big_core_ids; + std::vector little_core_ids; + MaceStatus res = GetCPUBigLittleCoreIDs(&big_core_ids, &little_core_ids); + if (res != MACE_SUCCESS) { + return res; + } + + std::vector use_cpu_ids; + if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) { + use_cpu_ids = std::move(big_core_ids); + } else { + use_cpu_ids = std::move(little_core_ids); + } + + if (omp_num_threads_hint < 0) { + omp_num_threads_hint = use_cpu_ids.size(); + } + SetOpenMPThreadsAndAffinityCPUs(omp_num_threads_hint, use_cpu_ids); + return MACE_SUCCESS; +} + } // namespace mace diff --git a/mace/core/runtime/cpu/cpu_runtime.h b/mace/core/runtime/cpu/cpu_runtime.h index 4687319bff338dc2c287977e2506e854bafa5320..a66f81165057dd68b95b50fc0aae822da150f283 100644 --- a/mace/core/runtime/cpu/cpu_runtime.h +++ b/mace/core/runtime/cpu/cpu_runtime.h @@ -6,14 +6,22 @@ #ifndef MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_ #define MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_ +#include + +#include "mace/public/mace.h" #include "mace/public/mace_runtime.h" namespace mace { -void SetOmpThreads(int omp_num_threads); +MaceStatus GetCPUBigLittleCoreIDs(std::vector *big_core_ids, + std::vector *little_core_ids); + +void SetOpenMPThreadsAndAffinityCPUs(int omp_num_threads, + const std::vector &cpu_ids); -void SetThreadsAffinity(CPUPowerOption power_option); +MaceStatus SetOpenMPThreadsAndAffinityPolicy(int omp_num_threads_hint, + CPUAffinityPolicy policy); -} +} // namespace mace #endif // MACE_CORE_RUNTIME_CPU_CPU_RUNTIME_H_ diff --git a/mace/core/testing/test_benchmark_main.cc b/mace/core/testing/test_benchmark_main.cc index e5bc0e896f79ecd0e6a3a9e3de46461b57ad0094..f95f883197c02f5312414332d659c8b7ce5ec540 100644 --- a/mace/core/testing/test_benchmark_main.cc +++ b/mace/core/testing/test_benchmark_main.cc @@ -12,19 +12,19 @@ DEFINE_string(filter, "all", "op benchmark regex filter, eg:.*CONV.*"); DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(omp_num_threads, 1, "num of openmp threads"); -DEFINE_int32(cpu_power_option, 1, - "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); +DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); +DEFINE_int32(cpu_affinity_policy, 1, + "0:AFFINITY_DEFAULT/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); int main(int argc, char **argv) { gflags::SetUsageMessage("some usage message"); gflags::ParseCommandLineFlags(&argc, &argv, true); // config runtime - mace::ConfigOmpThreads(FLAGS_omp_num_threads); - mace::ConfigCPUPowerOption( - static_cast(FLAGS_cpu_power_option)); - mace::ConfigOpenCLRuntime( + mace::SetOpenMPThreadPolicy( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_affinity_policy)); + mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); diff --git a/mace/examples/example.cc b/mace/examples/example.cc index e70025d6c171087fad4393bc3620f728517ac564..62bace7eb4cad0fc3a24553a58dcc1b3a86f8f16 100644 --- a/mace/examples/example.cc +++ b/mace/examples/example.cc @@ -126,12 +126,11 @@ DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); -DEFINE_int32(gpu_perf_hint, 2, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(gpu_priority_hint, 1, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(omp_num_threads, 8, "num of openmp threads"); -DEFINE_int32(cpu_power_option, - 0, - "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); +DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); +DEFINE_int32(cpu_affinity_policy, 1, + "0:AFFINITY_DEFAULT/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); bool RunModel(const std::vector &input_names, const std::vector> &input_shapes, @@ -145,11 +144,11 @@ bool RunModel(const std::vector &input_names, DeviceType device_type = ParseDeviceType(FLAGS_device); // config runtime - mace::ConfigOmpThreads(FLAGS_omp_num_threads); - mace::ConfigCPUPowerOption( - static_cast(FLAGS_cpu_power_option)); + MaceStatus res = mace::SetOpenMPThreadPolicy( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_affinity_policy)); if (device_type == DeviceType::OPENCL) { - mace::ConfigOpenCLRuntime( + mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); } @@ -160,7 +159,7 @@ bool RunModel(const std::vector &input_names, // Config internal kv storage factory. std::shared_ptr storage_factory( new FileStorageFactory(kernel_file_path)); - ConfigKVStorageFactory(storage_factory); + SetKVStorageFactory(storage_factory); // Init model mace::MaceEngine engine(&net_def, device_type, input_names, output_names); @@ -249,7 +248,7 @@ int Main(int argc, char **argv) { LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint; LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; - LOG(INFO) << "cpu_power_option: " << FLAGS_cpu_power_option; + LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy; std::vector input_names = str_util::Split(FLAGS_input_node, ','); std::vector output_names = diff --git a/mace/public/mace_runtime.h b/mace/public/mace_runtime.h index 8acb1cace385a876e97d62a2c2563c0bd13bd497..f159ea415d3fc20544b3e93bc4afad1870a09cfb 100644 --- a/mace/public/mace_runtime.h +++ b/mace/public/mace_runtime.h @@ -13,6 +13,8 @@ #include #include +#include "mace/public/mace.h" + namespace mace { enum GPUPerfHint { @@ -29,7 +31,11 @@ enum GPUPriorityHint { PRIORITY_HIGH = 3 }; -enum CPUPowerOption { DEFAULT = 0, HIGH_PERFORMANCE = 1, BATTERY_SAVE = 2 }; +enum CPUAffinityPolicy { + AFFINITY_DEFAULT = 0, + AFFINITY_BIG_ONLY = 1, + AFFINITY_LITTLE_ONLY = 2, +}; class KVStorage { public: @@ -60,12 +66,41 @@ class FileStorageFactory : public KVStorageFactory { std::unique_ptr impl_; }; -void ConfigKVStorageFactory(std::shared_ptr storage_factory); +// Set KV store factory used as OpenCL cache +void SetKVStorageFactory(std::shared_ptr storage_factory); -void ConfigOpenCLRuntime(GPUPerfHint, GPUPriorityHint); -void ConfigOmpThreads(int omp_num_threads); -void ConfigCPUPowerOption(CPUPowerOption power_option); +// Set GPU hints, currently only supports Adreno GPU +void SetGPUHints(GPUPerfHint perf_hint, GPUPriorityHint priority_hint); +// Set OpenMP threads number and affinity policy. +// +// num_threads_hint is only a hint, the function can change it when it's larger +// than 0. When num_threads_hint is not positive, the function will set the +// threads number equaling to the number of big + little, big or little cores +// according to the policy. +// +// This function may not work well on some ships (e.g. MTK), and in such +// cases (when it returns error MACE_INVALID_ARGS) you may try to use +// SetOpenMPThreadAffinity to set affinity manually, or just set default policy. +MaceStatus SetOpenMPThreadPolicy(int num_threads_hint, + CPUAffinityPolicy policy); + +// Set OpenMP threads number and processor affinity +// This function may not work well on some chips (e.g. MTK). Set thread affinity +// to offline cores may fail or run unexpectedly. In such cases, please use +// SetOpenMPThreadPolicy with default policy instead. +void SetOpenMPThreadAffinity(int num_threads, const std::vector &cpu_ids); + +// Get ARM big.LITTLE configuration. +// +// This function may not work well on some chips (e.g. MTK) and miss the +// offline cores, and the user should detect the configurations manually +// in such case(when it returns error MACE_INVALID_ARGS). +// +// If all cpu's frequencies are equal(i.e. all cores are the same), +// big_core_ids and little_core_ids will be set to all cpu ids. +MaceStatus GetBigLittleCoreIDs(std::vector *big_core_ids, + std::vector *little_core_ids); } // namespace mace diff --git a/mace/tools/validation/mace_run.cc b/mace/tools/validation/mace_run.cc index 8d83b03d59f9aa63700c5dadd026049515eff071..066ecd3f871e5cc03459bb6945450707b091ecf7 100644 --- a/mace/tools/validation/mace_run.cc +++ b/mace/tools/validation/mace_run.cc @@ -188,12 +188,11 @@ DEFINE_string(device, "OPENCL", "CPU/NEON/OPENCL/HEXAGON"); DEFINE_int32(round, 1, "round"); DEFINE_int32(restart_round, 1, "restart round"); DEFINE_int32(malloc_check_cycle, -1, "malloc debug check cycle, -1 to disable"); -DEFINE_int32(gpu_perf_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(gpu_priority_hint, 0, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); -DEFINE_int32(omp_num_threads, 4, "num of openmp threads"); -DEFINE_int32(cpu_power_option, - 0, - "0:DEFAULT/1:HIGH_PERFORMANCE/2:BATTERY_SAVE"); +DEFINE_int32(gpu_perf_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(gpu_priority_hint, 3, "0:DEFAULT/1:LOW/2:NORMAL/3:HIGH"); +DEFINE_int32(omp_num_threads, -1, "num of openmp threads"); +DEFINE_int32(cpu_affinity_policy, 1, + "0:AFFINITY_DEFAULT/1:AFFINITY_BIG_ONLY/2:AFFINITY_LITTLE_ONLY"); bool RunModel(const std::vector &input_names, const std::vector> &input_shapes, @@ -212,11 +211,11 @@ bool RunModel(const std::vector &input_names, LOG(INFO) << "Runing with device type: " << device_type; // config runtime - mace::ConfigOmpThreads(FLAGS_omp_num_threads); - mace::ConfigCPUPowerOption( - static_cast(FLAGS_cpu_power_option)); + mace::SetOpenMPThreadPolicy( + FLAGS_omp_num_threads, + static_cast(FLAGS_cpu_affinity_policy)); if (device_type == DeviceType::OPENCL) { - mace::ConfigOpenCLRuntime( + mace::SetGPUHints( static_cast(FLAGS_gpu_perf_hint), static_cast(FLAGS_gpu_priority_hint)); } @@ -230,7 +229,7 @@ bool RunModel(const std::vector &input_names, LOG(INFO) << "Run init"; std::shared_ptr storage_factory( new FileStorageFactory(kernel_file_path)); - ConfigKVStorageFactory(storage_factory); + SetKVStorageFactory(storage_factory); mace::MaceEngine engine(&net_def, device_type, input_names, output_names); if (device_type == DeviceType::OPENCL || device_type == DeviceType::HEXAGON) { mace::MACE_MODEL_TAG::UnloadModelData(model_data); @@ -350,7 +349,7 @@ int Main(int argc, char **argv) { LOG(INFO) << "gpu_perf_hint: " << FLAGS_gpu_perf_hint; LOG(INFO) << "gpu_priority_hint: " << FLAGS_gpu_priority_hint; LOG(INFO) << "omp_num_threads: " << FLAGS_omp_num_threads; - LOG(INFO) << "cpu_power_option: " << FLAGS_cpu_power_option; + LOG(INFO) << "cpu_affinity_policy: " << FLAGS_cpu_affinity_policy; std::vector input_names = str_util::Split(FLAGS_input_node, ','); std::vector output_names =