diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index ad60447e706613252affbac36d635a8f88193a71..191e6c35368619a5ba4671b59362a2043ebdf892 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -31,16 +31,12 @@ #include "mace/public/mace.h" #include "mace/utils/macros.h" #include "mace/utils/logging.h" +#include "mace/utils/thread_pool.h" namespace mace { int MaceOpenMPThreadCount = 1; -struct CPUFreq { - size_t core_id; - float freq; -}; - enum SchedulePolicy { SCHED_STATIC, SCHED_GUIDED, @@ -105,28 +101,12 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( return MaceStatus::MACE_RUNTIME_ERROR; } - std::vector cpu_freq(cpu_max_freqs.size()); - for (size_t i = 0; i < cpu_max_freqs.size(); ++i) { - cpu_freq[i].core_id = i; - cpu_freq[i].freq = cpu_max_freqs[i]; - } - if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE || - policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { - std::sort(cpu_freq.begin(), - cpu_freq.end(), - [=](const CPUFreq &lhs, const CPUFreq &rhs) { - return lhs.freq < rhs.freq; - }); - } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE || - policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) { - std::sort(cpu_freq.begin(), - cpu_freq.end(), - [](const CPUFreq &lhs, const CPUFreq &rhs) { - return lhs.freq > rhs.freq; - }); - } + std::vector cores_to_use; + MACE_RETURN_IF_ERROR( + mace::utils::GetCPUCoresToUse( + cpu_max_freqs, policy, num_threads_hint, &cores_to_use)); - int cpu_count = static_cast(cpu_freq.size()); + int cpu_count = static_cast(cores_to_use.size()); if (num_threads_hint <= 0 || num_threads_hint > cpu_count) { num_threads_hint = cpu_count; } @@ -148,32 +128,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( return MaceStatus::MACE_SUCCESS; } - - // decide num of cores to use - int cores_to_use = 0; - if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY - || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { - for 
(size_t i = 0; i < cpu_max_freqs.size(); ++i) { - if (cpu_freq[i].freq != cpu_freq[0].freq) { - break; - } - ++cores_to_use; - } - num_threads_hint = std::min(num_threads_hint, cores_to_use); - } else { - cores_to_use = num_threads_hint; - } - MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0"); - - VLOG(2) << "Use " << num_threads_hint << " threads"; - std::vector cpu_ids(cores_to_use); - for (int i = 0; i < cores_to_use; ++i) { - VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq " - << cpu_freq[i].freq; - cpu_ids[i] = cpu_freq[i].core_id; - } SchedulePolicy sched_policy = SCHED_GUIDED; - if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) { + float first_freq = cpu_max_freqs[cores_to_use[0]]; + float last_freq = cpu_max_freqs[cores_to_use[cores_to_use.size() - 1]]; + if (std::abs(first_freq - last_freq) < 1e-6) { sched_policy = SCHED_STATIC; } @@ -185,7 +143,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( #endif // MACE_ENABLE_QUANTIZE return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint, - cpu_ids, + cores_to_use, sched_policy); } diff --git a/mace/port/android/env.cc b/mace/port/android/env.cc index fa338f078afef4ba6dbf5bb9930e554aab2b8292..a247cea0b7ff052d6b306487379e691a82fc2b27 100644 --- a/mace/port/android/env.cc +++ b/mace/port/android/env.cc @@ -17,7 +17,6 @@ #include #include #include -#include #include #include @@ -50,46 +49,60 @@ LogWriter *AndroidEnv::GetLogWriter() { namespace { struct BacktraceState { - void** current; - void** end; + void **current; + void **end; }; -_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context* context, void* arg) { - BacktraceState* state = static_cast(arg); +_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context *context, void *arg) { + BacktraceState *state = static_cast(arg); uintptr_t pc = _Unwind_GetIP(context); if (pc) { if (state->current == state->end) { return _URC_END_OF_STACK; } else { - *state->current++ = 
reinterpret_cast(pc); + *state->current++ = reinterpret_cast(pc); } } return _URC_NO_REASON; } -size_t BackTrace(void** buffer, size_t max) { +size_t BackTrace(void **buffer, size_t max) { BacktraceState state = {buffer, buffer + max}; _Unwind_Backtrace(UnwindCallback, &state); return state.current - buffer; } +bool CpuIsolate(size_t cpu_id) { + std::string cpuinfo_isolate_conf = MakeString( + "/sys/devices/system/cpu/cpu", + cpu_id, + "/isolate"); + std::ifstream isolate_file(cpuinfo_isolate_conf); + int isolate_switch = 0; + if (isolate_file.is_open()) { + std::string line; + if (std::getline(isolate_file, line)) { + isolate_switch = strtol(line.c_str(), nullptr, 0); + } + isolate_file.close(); + } + + return (isolate_switch != 0); +} + } // namespace -MaceStatus AndroidEnv::SchedSetAffinity(const std::vector &cpu_ids) { - // compute mask - cpu_set_t mask; - CPU_ZERO(&mask); - for (auto cpu_id : cpu_ids) { - CPU_SET(cpu_id, &mask); - } - pid_t pid = gettid(); - int err = sched_setaffinity(pid, sizeof(mask), &mask); - if (err) { - LOG(WARNING) << "SchedSetAffinity failed: " << strerror(errno); - return MaceStatus(MaceStatus::MACE_INVALID_ARGS, - "SchedSetAffinity failed: " + - std::string(strerror(errno))); +MaceStatus AndroidEnv::GetCPUMaxFreq(std::vector *max_freqs) { + MACE_RETURN_IF_ERROR(LinuxBaseEnv::GetCPUMaxFreq(max_freqs)); + + size_t cpu_num = (max_freqs != nullptr) ? 
max_freqs->size() : 0; + if (cpu_num > 0) { + for (size_t i = 0; i < cpu_num; ++i) { + if (CpuIsolate(i)) { + (*max_freqs)[i] = 0; + } + } } return MaceStatus::MACE_SUCCESS; @@ -103,8 +116,8 @@ std::vector AndroidEnv::GetBackTraceUnsafe(int max_steps) { for (int i = 0; i < steps; ++i) { std::ostringstream os; - const void* addr = buffer[i]; - const char* symbol = ""; + const void *addr = buffer[i]; + const char *symbol = ""; Dl_info info; if (dladdr(addr, &info) && info.dli_sname) { symbol = info.dli_sname; diff --git a/mace/port/android/env.h b/mace/port/android/env.h index 071340367bf39a03b65837eaea68f105852fce2f..39d16d95468e1b26c8983f1b6700bccb4834ceea 100644 --- a/mace/port/android/env.h +++ b/mace/port/android/env.h @@ -29,8 +29,8 @@ namespace port { class AndroidEnv : public LinuxBaseEnv { public: - MaceStatus SchedSetAffinity(const std::vector &cpu_ids) override; LogWriter *GetLogWriter() override; + MaceStatus GetCPUMaxFreq(std::vector *max_freqs) override; std::vector GetBackTraceUnsafe(int max_steps) override; std::unique_ptr NewMallocLogger( std::ostringstream *oss, diff --git a/mace/port/darwin/env.cc b/mace/port/darwin/env.cc index 3344adbbc487b6bedbd745157c205ab6680ddfb0..2e4a3694ac32ab97d6fb80f61c28c51c52dc7abf 100644 --- a/mace/port/darwin/env.cc +++ b/mace/port/darwin/env.cc @@ -15,6 +15,8 @@ #include "mace/port/darwin/env.h" #include +#include +#include #include #include #include @@ -33,27 +35,64 @@ namespace mace { namespace port { namespace { -const char kCpuFrequencyMax[] = "hw.cpufrequency_max"; + +constexpr const char kCpuFrequencyMax[] = "hw.cpufrequency_max"; +constexpr const char kCpuActiveNum[] = "hw.activecpu"; + } int64_t DarwinEnv::NowMicros() { return mace::port::posix::NowMicros(); } -// TODO(luxuhui): this func is not accurate, darwin does not support -// acquiring CPU frequencies, we need to reconsider the CPU scheduling -// strategy. 
-MaceStatus DarwinEnv::GetCPUMaxFreq(std::vector *max_freqs) { - MACE_CHECK_NOTNULL(max_freqs); +// We can't get the frequency of every CPU on darwin, so this method +// returns fake data: the same max frequency for every active CPU. +MaceStatus DarwinEnv::GetCPUMaxFreq(std::vector *cpu_infos) { + MACE_CHECK_NOTNULL(cpu_infos); uint64_t freq = 0; size_t size = sizeof(freq); int ret = sysctlbyname(kCpuFrequencyMax, &freq, &size, NULL, 0); if (ret < 0) { LOG(ERROR) << "failed to get property: " << kCpuFrequencyMax; return MaceStatus::MACE_RUNTIME_ERROR; } - max_freqs->push_back(freq); + + uint64_t cpu_num = 0; + size = sizeof(cpu_num); + ret = sysctlbyname(kCpuActiveNum, &cpu_num, &size, NULL, 0); + if (ret < 0) { + LOG(ERROR) << "failed to get property: " << kCpuActiveNum; + return MaceStatus::MACE_RUNTIME_ERROR; + } + + for (uint64_t i = 0; i < cpu_num; ++i) { + cpu_infos->push_back(freq); + } + + return MaceStatus::MACE_SUCCESS; +} + +MaceStatus DarwinEnv::SchedSetAffinity( + const std::vector &cpu_ids) { + unsigned int tag = 0; + for (size_t i = 0; i < cpu_ids.size(); ++i) { + tag += (cpu_ids[i] << i); + } + +#ifdef MACE_OS_MAC + pthread_t thread = pthread_self(); + mach_port_t mach_port = pthread_mach_thread_np(thread); + thread_affinity_policy_data_t policy_data = {(integer_t) tag}; + int ret = thread_policy_set(mach_port, + THREAD_AFFINITY_POLICY, + (thread_policy_t) & policy_data, + 1); + if (ret) { + LOG(WARNING) << "thread_policy_set failed, kern_return_t: " << ret; + return MaceStatus::MACE_RUNTIME_ERROR; + } +#endif return MaceStatus::MACE_SUCCESS; } diff --git a/mace/port/darwin/env.h b/mace/port/darwin/env.h index 7205bb7fa97fd020a294198d45b47114f6ee4873..d709af6a6696de7b3a86c4cc71a2c67fb72ea484 100644 --- a/mace/port/darwin/env.h +++ b/mace/port/darwin/env.h @@ -20,6 +20,7 @@ #include "mace/port/env.h" #include "mace/port/logger.h" +#include "mace/port/port-arch.h" #include "mace/port/posix/file_system.h" namespace mace { @@ -29,6 +30,7 @@ class DarwinEnv : public Env { 
public: int64_t NowMicros() override; MaceStatus GetCPUMaxFreq(std::vector *max_freqs) override; + MaceStatus SchedSetAffinity(const std::vector &cpu_ids) override; FileSystem *GetFileSystem() override; LogWriter *GetLogWriter() override; std::vector GetBackTraceUnsafe(int max_steps) override; diff --git a/mace/port/linux/env.cc b/mace/port/linux/env.cc index 00831c5ed89c11a1163c57a1a83a6bdbdd386f62..6e534516db6940b00f9b57910dbd5523f0cf8be8 100644 --- a/mace/port/linux/env.cc +++ b/mace/port/linux/env.cc @@ -25,10 +25,21 @@ #include "mace/port/posix/backtrace.h" #include "mace/port/posix/file_system.h" #include "mace/port/posix/time.h" +#include "mace/utils/macros.h" namespace mace { namespace port { +// In our embedded linux device, SchedSetAffinity has side effects +// on performance, so we override this method to do nothing. You +// can try to comment this function, perhaps you could get a better +// performance as we do in Android devices. +MaceStatus LinuxEnv::SchedSetAffinity(const std::vector &cpu_ids) { + MACE_UNUSED(cpu_ids); + + return MaceStatus::MACE_SUCCESS; +} + LogWriter *LinuxEnv::GetLogWriter() { return &log_writer_; } diff --git a/mace/port/linux/env.h b/mace/port/linux/env.h index 825dd29d9afe11fe1fd234ad1e1ba888381a403d..9e2dc517a3820d8c993d8272cc6eab72647f24df 100644 --- a/mace/port/linux/env.h +++ b/mace/port/linux/env.h @@ -26,6 +26,7 @@ namespace port { class LinuxEnv : public LinuxBaseEnv { public: + MaceStatus SchedSetAffinity(const std::vector &cpu_ids) override; LogWriter *GetLogWriter() override; std::vector GetBackTraceUnsafe(int max_steps) override; diff --git a/mace/port/linux_base/env.cc b/mace/port/linux_base/env.cc index 335e0e31b60f8a70afd3666b5dd04d3118458c7a..10b946ac62de806ddc7f1f1cf113530b6e4d1924 100644 --- a/mace/port/linux_base/env.cc +++ b/mace/port/linux_base/env.cc @@ -14,7 +14,10 @@ #include "mace/port/linux_base/env.h" +#include +#include #include +#include #include #include @@ -28,7 +31,6 @@ namespace mace { 
namespace port { - namespace { int GetCPUCount() { @@ -100,5 +102,24 @@ MaceStatus LinuxBaseEnv::GetCPUMaxFreq(std::vector *max_freqs) { return MaceStatus::MACE_SUCCESS; } +MaceStatus LinuxBaseEnv::SchedSetAffinity(const std::vector &cpu_ids) { + cpu_set_t mask; + CPU_ZERO(&mask); + for (auto cpu_id : cpu_ids) { + CPU_SET(cpu_id, &mask); + } + + pid_t pid = syscall(SYS_gettid); + int err = sched_setaffinity(pid, sizeof(mask), &mask); + if (err) { + LOG(WARNING) << "SchedSetAffinity failed: " << strerror(errno); + return MaceStatus(MaceStatus::MACE_INVALID_ARGS, + "SchedSetAffinity failed: " + + std::string(strerror(errno))); + } + + return MaceStatus::MACE_SUCCESS; +} + } // namespace port } // namespace mace diff --git a/mace/port/linux_base/env.h b/mace/port/linux_base/env.h index 07270f2a7b3eaef3997f5a94e87a218fa5b64ca0..7ef0e9fcd3149cb681b4d8ccafe0ecf9dee7bc2a 100644 --- a/mace/port/linux_base/env.h +++ b/mace/port/linux_base/env.h @@ -28,6 +28,7 @@ class LinuxBaseEnv : public Env { int64_t NowMicros() override; MaceStatus GetCPUMaxFreq(std::vector *max_freqs) override; FileSystem *GetFileSystem() override; + MaceStatus SchedSetAffinity(const std::vector &cpu_ids) override; protected: PosixFileSystem posix_file_system_; diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc index 8cbdbf140e65c4aee6c22378b6996d8616c7ee24..5fa3ad6e419b8f597c803d173bf9ec47b94c6fd2 100644 --- a/mace/utils/thread_pool.cc +++ b/mace/utils/thread_pool.cc @@ -13,6 +13,8 @@ // limitations under the License. 
#include +#include + #include "mace/port/port.h" #include "mace/port/env.h" #include "mace/utils/logging.h" @@ -26,6 +28,8 @@ namespace utils { constexpr int kThreadPoolSpinWaitTime = 2000000; // ns constexpr int kTileCountPerThread = 2; constexpr int kMaxCostUsingSingleThread = 100; +constexpr int kMinCpuCoresForPerformance = 3; +constexpr int kMaxCpuCoresForPerformance = 5; namespace { @@ -42,67 +46,87 @@ struct CPUFreq { float freq; }; -void GetCPUCoresToUse(const std::vector &cpu_max_freqs, - const CPUAffinityPolicy policy, - const size_t thread_count_hint, - std::vector *cores) { - size_t thread_count = thread_count_hint; - if (!cpu_max_freqs.empty()) { - const size_t cpu_count = cpu_max_freqs.size(); - if (thread_count == 0 || thread_count > cpu_count) { - thread_count = cpu_count; +size_t GetCpuCoresForPerfomance(const std::vector &cpu_freqs) { + float total_freq = std::accumulate(cpu_freqs.begin(), cpu_freqs.end(), 0.0f, + [](float accum, CPUFreq cpu_freq) { + return accum + cpu_freq.freq; + }); + size_t valid_cpu_nums = std::count_if(cpu_freqs.begin(), cpu_freqs.end(), + [](CPUFreq cpu_freq) { + return cpu_freq.freq != 0; + }); + float avg_freq = total_freq / valid_cpu_nums; + + size_t cores_to_use = 0; + for (auto cpu_info : cpu_freqs) { + if ((cpu_info.freq > avg_freq + && cores_to_use < kMaxCpuCoresForPerformance) + || cores_to_use < kMinCpuCoresForPerformance) { + ++cores_to_use; } + } - if (policy != CPUAffinityPolicy::AFFINITY_NONE) { - std::vector cpu_freq(cpu_max_freqs.size()); - for (size_t i = 0; i < cpu_max_freqs.size(); ++i) { - cpu_freq[i].core_id = i; - cpu_freq[i].freq = cpu_max_freqs[i]; - } - if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE || - policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { - std::sort(cpu_freq.begin(), - cpu_freq.end(), - [=](const CPUFreq &lhs, const CPUFreq &rhs) { - return lhs.freq < rhs.freq; - }); - } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE || - policy == 
CPUAffinityPolicy::AFFINITY_BIG_ONLY) { - std::sort(cpu_freq.begin(), - cpu_freq.end(), - [](const CPUFreq &lhs, const CPUFreq &rhs) { - return lhs.freq > rhs.freq; - }); - } + return cores_to_use; +} - // decide num of cores to use - size_t cores_to_use = 0; - if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY - || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { - for (size_t i = 0; i < cpu_max_freqs.size(); ++i) { - if (cpu_freq[i].freq != cpu_freq[0].freq) { - break; - } - ++cores_to_use; - } - } else { - cores_to_use = thread_count; - } - MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0"); - cores->resize(cores_to_use); - for (size_t i = 0; i < cores_to_use; ++i) { - VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id - << " with freq " - << cpu_freq[i].freq; - (*cores)[i] = static_cast(cpu_freq[i].core_id); - } - } - } else { +} // namespace + +MaceStatus GetCPUCoresToUse(const std::vector &cpu_max_freqs, + const CPUAffinityPolicy policy, + const size_t thread_count_hint, + std::vector *cores) { + if (cpu_max_freqs.empty()) { LOG(ERROR) << "CPU core is empty"; + return MaceStatus::MACE_RUNTIME_ERROR; + } + size_t thread_count = thread_count_hint; + const size_t cpu_count = cpu_max_freqs.size(); + if (thread_count == 0 || thread_count > cpu_count) { + thread_count = cpu_count; } -} -} // namespace + if (policy != CPUAffinityPolicy::AFFINITY_NONE) { + std::vector cpu_freq(cpu_max_freqs.size()); + for (size_t i = 0; i < cpu_max_freqs.size(); ++i) { + cpu_freq[i].core_id = i; + cpu_freq[i].freq = cpu_max_freqs[i]; + } + if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE || + policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { + std::sort(cpu_freq.begin(), + cpu_freq.end(), + [=](const CPUFreq &lhs, const CPUFreq &rhs) { + return lhs.freq < rhs.freq; + }); + } else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE || + policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) { + std::sort(cpu_freq.begin(), + cpu_freq.end(), + [](const 
CPUFreq &lhs, const CPUFreq &rhs) { + return lhs.freq > rhs.freq; + }); + } + + // decide num of cores to use + size_t cores_to_use = 0; + if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY + || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { + cores_to_use = GetCpuCoresForPerfomance(cpu_freq); + } else { + cores_to_use = thread_count; + } + MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0"); + cores->resize(cores_to_use); + for (size_t i = 0; i < cores_to_use; ++i) { + VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id + << " with freq " + << cpu_freq[i].freq; + (*cores)[i] = static_cast(cpu_freq[i].core_id); + } + } + + return MaceStatus::MACE_SUCCESS; +} ThreadPool::ThreadPool(const size_t thread_count_hint, const CPUAffinityPolicy policy) @@ -173,13 +197,13 @@ void ThreadPool::Run(const std::function &func, std::unique_lock run_lock(run_mutex_); for (size_t i = 0; i < thread_count; ++i) { - int64_t count = iters_per_thread + (static_cast(i) < remainder); + int64_t range_len = + iters_per_thread + (static_cast(i) < remainder); thread_infos_[i].range_start = iters_offset; - int64_t range_end = iters_offset + count; - thread_infos_[i].range_end = range_end; - thread_infos_[i].range_len = range_end - iters_offset; + thread_infos_[i].range_len = range_len; + thread_infos_[i].range_end = iters_offset + range_len; thread_infos_[i].func = reinterpret_cast(&func); - iters_offset += thread_infos_[i].range_len; + iters_offset = thread_infos_[i].range_end; } count_down_latch_.Reset(thread_count - 1); diff --git a/mace/utils/thread_pool.h b/mace/utils/thread_pool.h index 90d30257bf66da0b7d6d82776b87071779396b9f..5e77c8df1d05012d690ef7088d97439ce5d6e637 100644 --- a/mace/utils/thread_pool.h +++ b/mace/utils/thread_pool.h @@ -29,6 +29,11 @@ namespace mace { namespace utils { +MaceStatus GetCPUCoresToUse(const std::vector &cpu_max_freqs, + const CPUAffinityPolicy policy, + const size_t thread_count_hint, + std::vector *cores); + class ThreadPool { 
public: ThreadPool(const size_t thread_count,