// Patch overview (reviewer preamble; the diff text below is unchanged).
//
// mace/core/runtime/cpu/cpu_runtime.cc
//   SetOpenMPThreadsAndAffinityPolicy now passes &num_threads_hint to
//   GetCPUCoresToUse and drops its own clamp of num_threads_hint against
//   cores_to_use.size(); the callee is expected to write the final value back.
//
// mace/utils/thread_pool.cc
//   GetCpuCoresForPerfomance: returns int instead of size_t and takes a
//   comparator `comp`, which replaces the hard-coded `freq > avg_freq` test.
//   AFFINITY_BIG_ONLY passes std::greater_equal, AFFINITY_LITTLE_ONLY passes
//   std::less_equal (both policies previously shared the same `>` test).
//
//   GetCPUCoresToUse: the thread count becomes an in/out `int *`.
//     - empty cpu_max_freqs: *thread_count is set to 1, then
//       MACE_RUNTIME_ERROR is returned;
//     - negative input is clamped to 0; 0 or a value above the CPU count
//       becomes the CPU count;
//     - after the core list is filled, *thread_count is capped at
//       cores_to_use.
//
//   ThreadPool::ThreadPool: takes `int`, stores the max frequencies in the new
//   member cpu_max_freqs_, leaves clamping to GetCPUCoresToUse, and checks
//   thread_count > 0. default_tile_count_ is multiplied by kTileCountPerThread
//   whenever thread_count > 1 (previously only when the first and last
//   selected cores had different max frequencies).
//
//   ThreadPool::~ThreadPool: when cpu_max_freqs_ is non-empty, calls
//   SchedSetAffinity with core ids 0..size-1 before Destroy().
//
// mace/utils/thread_pool.h
//   Matching signature changes plus the cpu_max_freqs_ member.
//
// NOTE(review): template argument lists appear to be missing throughout this
//   copy of the patch (e.g. `std::vector &`, `static_cast(`); presumably an
//   extraction artifact -- confirm against the real commit.
// NOTE(review): std::accumulate is seeded with the int literal 0, so the
//   running sum is presumably held in an int and each float step truncated;
//   consider 0.0f -- confirm.
// NOTE(review): avg_freq divides by valid_cpu_nums, which looks like it can be
//   0 when every freq is 0 -- confirm whether callers rule that out.
// NOTE(review): the constructor ignores the status returned by
//   GetCPUCoresToUse and relies on the thread_count it writes back.
// NOTE(review): the header names the parameter thread_count_hint while the
//   definition names it thread_count.
diff --git a/mace/core/runtime/cpu/cpu_runtime.cc b/mace/core/runtime/cpu/cpu_runtime.cc index 191e6c35368619a5ba4671b59362a2043ebdf892..6880dc47145bc761037b3f21be4f4791f4b26613 100644 --- a/mace/core/runtime/cpu/cpu_runtime.cc +++ b/mace/core/runtime/cpu/cpu_runtime.cc @@ -104,12 +104,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy( std::vector cores_to_use; MACE_RETURN_IF_ERROR( mace::utils::GetCPUCoresToUse( - cpu_max_freqs, policy, num_threads_hint, &cores_to_use)); - - int cpu_count = static_cast(cores_to_use.size()); - if (num_threads_hint <= 0 || num_threads_hint > cpu_count) { - num_threads_hint = cpu_count; - } + cpu_max_freqs, policy, &num_threads_hint, &cores_to_use)); if (policy == CPUAffinityPolicy::AFFINITY_NONE) { #ifdef MACE_ENABLE_QUANTIZE diff --git a/mace/utils/thread_pool.cc b/mace/utils/thread_pool.cc index c6596e5c52b991c1f198d93ba369b8d10e5a7d62..da8088752712daa82237ea96973673d46274257a 100644 --- a/mace/utils/thread_pool.cc +++ b/mace/utils/thread_pool.cc @@ -46,20 +46,22 @@ struct CPUFreq { float freq; }; -size_t GetCpuCoresForPerfomance(const std::vector &cpu_freqs) { +int GetCpuCoresForPerfomance( + const std::vector &cpu_freqs, + const std::function &comp) { float total_freq = std::accumulate(cpu_freqs.begin(), cpu_freqs.end(), 0, [](float accum, CPUFreq cpu_freq) { return accum + cpu_freq.freq; }); - size_t valid_cpu_nums = std::count_if(cpu_freqs.begin(), cpu_freqs.end(), + int64_t valid_cpu_nums = std::count_if(cpu_freqs.begin(), cpu_freqs.end(), [](CPUFreq cpu_freq) { return cpu_freq.freq != 0; }); float avg_freq = total_freq / valid_cpu_nums; - size_t cores_to_use = 0; + int cores_to_use = 0; for (auto cpu_info : cpu_freqs) { - if ((cpu_info.freq > avg_freq + if ((comp(cpu_info.freq, avg_freq) && cores_to_use < kMaxCpuCoresForPerformance) || cores_to_use < kMinCpuCoresForPerformance) { ++cores_to_use; @@ -73,16 +75,17 @@ size_t GetCpuCoresForPerfomance(const std::vector &cpu_freqs) { MaceStatus GetCPUCoresToUse(const 
std::vector &cpu_max_freqs, const CPUAffinityPolicy policy, - const size_t thread_count_hint, + int *thread_count, std::vector *cores) { if (cpu_max_freqs.empty()) { + *thread_count = 1; LOG(ERROR) << "CPU core is empty"; return MaceStatus::MACE_RUNTIME_ERROR; } - size_t thread_count = thread_count_hint; - const size_t cpu_count = cpu_max_freqs.size(); - if (thread_count == 0 || thread_count > cpu_count) { - thread_count = cpu_count; + *thread_count = std::max(*thread_count, 0); + const int cpu_count = static_cast(cpu_max_freqs.size()); + if (*thread_count == 0 || *thread_count > cpu_count) { + *thread_count = cpu_count; } if (policy != CPUAffinityPolicy::AFFINITY_NONE) { @@ -108,69 +111,78 @@ MaceStatus GetCPUCoresToUse(const std::vector &cpu_max_freqs, } // decide num of cores to use - size_t cores_to_use = 0; - if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY - || policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { - cores_to_use = GetCpuCoresForPerfomance(cpu_freq); + int cores_to_use = 0; + if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) { + cores_to_use = + GetCpuCoresForPerfomance(cpu_freq, std::greater_equal()); + } else if (policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) { + cores_to_use = + GetCpuCoresForPerfomance(cpu_freq, std::less_equal()); } else { - cores_to_use = thread_count; + cores_to_use = *thread_count; } MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0"); - cores->resize(cores_to_use); - for (size_t i = 0; i < cores_to_use; ++i) { + cores->resize(static_cast(cores_to_use)); + for (int i = 0; i < cores_to_use; ++i) { VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq " << cpu_freq[i].freq; (*cores)[i] = static_cast(cpu_freq[i].core_id); } + if (*thread_count == 0 || *thread_count > cores_to_use) { + *thread_count = cores_to_use; + } } return MaceStatus::MACE_SUCCESS; } -ThreadPool::ThreadPool(const size_t thread_count_hint, +ThreadPool::ThreadPool(const int thread_count_hint, const 
CPUAffinityPolicy policy) : event_(kThreadPoolNone), count_down_latch_(kThreadPoolSpinWaitTime) { - size_t thread_count = thread_count_hint; + int thread_count = thread_count_hint; - std::vector cpu_max_freqs; - if (port::Env::Default()->GetCPUMaxFreq(&cpu_max_freqs) + if (port::Env::Default()->GetCPUMaxFreq(&cpu_max_freqs_) != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Fail to get cpu max frequencies"; } - thread_count = std::max(static_cast(1), - std::min(thread_count, cpu_max_freqs.size())); - std::vector cores_to_use; - GetCPUCoresToUse(cpu_max_freqs, policy, thread_count, &cores_to_use); + GetCPUCoresToUse(cpu_max_freqs_, policy, &thread_count, &cores_to_use); + MACE_CHECK(thread_count > 0); + VLOG(2) << "Use " << thread_count << " threads"; + if (!cores_to_use.empty()) { if (port::Env::Default()->SchedSetAffinity(cores_to_use) != MaceStatus::MACE_SUCCESS) { LOG(ERROR) << "Failed to sched_set_affinity"; } } - if (!cores_to_use.empty() && thread_count > cores_to_use.size()) { - thread_count = cores_to_use.size(); - } - VLOG(2) << "Use " << thread_count << " threads"; default_tile_count_ = thread_count; - if (cores_to_use.size() >= 2 - && cpu_max_freqs[cores_to_use[0]] != cpu_max_freqs[cores_to_use.back()]) { + if (thread_count > 1) { default_tile_count_ = thread_count * kTileCountPerThread; } MACE_CHECK(default_tile_count_ > 0, "default tile count should > 0"); - threads_ = std::vector(thread_count); - thread_infos_ = std::vector(thread_count); + threads_ = std::vector(static_cast(thread_count)); + thread_infos_ = std::vector(static_cast(thread_count)); for (auto &thread_info : thread_infos_) { thread_info.cpu_cores = cores_to_use; } } ThreadPool::~ThreadPool() { + // Clear affinity of main thread + if (!cpu_max_freqs_.empty()) { + std::vector cores(cpu_max_freqs_.size()); + for (size_t i = 0; i < cores.size(); ++i) { + cores[i] = i; + } + port::Env::Default()->SchedSetAffinity(cores); + } + Destroy(); } diff --git a/mace/utils/thread_pool.h 
b/mace/utils/thread_pool.h index 5e77c8df1d05012d690ef7088d97439ce5d6e637..275038b0d8a6d3d62ab80807d7c2080fc5b1f2a0 100644 --- a/mace/utils/thread_pool.h +++ b/mace/utils/thread_pool.h @@ -31,12 +31,12 @@ namespace utils { MaceStatus GetCPUCoresToUse(const std::vector &cpu_max_freqs, const CPUAffinityPolicy policy, - const size_t thread_count_hint, + int *thread_count_hint, std::vector *cores); class ThreadPool { public: - ThreadPool(const size_t thread_count, + ThreadPool(const int thread_count, const CPUAffinityPolicy affinity_policy); ~ThreadPool(); @@ -114,6 +114,7 @@ class ThreadPool { }; std::vector thread_infos_; std::vector threads_; + std::vector cpu_max_freqs_; int64_t default_tile_count_; };