提交 8430a0e2 编写于 作者: L luxuhui

Fix sched_setaffinity bug and optimize the performance of the thread pool

N/A
Signed-off-by: NLuxuhui <luxuhui@xiaomi.com>
上级 6c178680
......@@ -31,16 +31,12 @@
#include "mace/public/mace.h"
#include "mace/utils/macros.h"
#include "mace/utils/logging.h"
#include "mace/utils/thread_pool.h"
namespace mace {
int MaceOpenMPThreadCount = 1;
// Pairs a CPU core index with its frequency so that cores can be sorted by
// frequency while remembering which core each entry came from.
struct CPUFreq {
// Index of the core; assigned from the position in the frequency vector.
size_t core_id;
// Frequency value copied from cpu_max_freqs for this core.
float freq;
};
enum SchedulePolicy {
SCHED_STATIC,
SCHED_GUIDED,
......@@ -105,28 +101,12 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
return MaceStatus::MACE_RUNTIME_ERROR;
}
std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
cpu_freq[i].core_id = i;
cpu_freq[i].freq = cpu_max_freqs[i];
}
if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[=](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq < rhs.freq;
});
} else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq > rhs.freq;
});
}
std::vector<size_t> cores_to_use;
MACE_RETURN_IF_ERROR(
mace::utils::GetCPUCoresToUse(
cpu_max_freqs, policy, num_threads_hint, &cores_to_use));
int cpu_count = static_cast<int>(cpu_freq.size());
int cpu_count = static_cast<int>(cores_to_use.size());
if (num_threads_hint <= 0 || num_threads_hint > cpu_count) {
num_threads_hint = cpu_count;
}
......@@ -148,32 +128,10 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
return MaceStatus::MACE_SUCCESS;
}
// decide num of cores to use
int cores_to_use = 0;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
|| policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
if (cpu_freq[i].freq != cpu_freq[0].freq) {
break;
}
++cores_to_use;
}
num_threads_hint = std::min(num_threads_hint, cores_to_use);
} else {
cores_to_use = num_threads_hint;
}
MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
VLOG(2) << "Use " << num_threads_hint << " threads";
std::vector<size_t> cpu_ids(cores_to_use);
for (int i = 0; i < cores_to_use; ++i) {
VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id << " with freq "
<< cpu_freq[i].freq;
cpu_ids[i] = cpu_freq[i].core_id;
}
SchedulePolicy sched_policy = SCHED_GUIDED;
if (std::abs(cpu_freq[0].freq - cpu_freq[cores_to_use - 1].freq) < 1e-6) {
float first_freq = cpu_max_freqs[cores_to_use[0]];
float last_freq = cpu_max_freqs[cores_to_use[cores_to_use.size() - 1]];
if (std::abs(first_freq - last_freq) < 1e-6) {
sched_policy = SCHED_STATIC;
}
......@@ -185,7 +143,7 @@ MaceStatus CPURuntime::SetOpenMPThreadsAndAffinityPolicy(
#endif // MACE_ENABLE_QUANTIZE
return SetOpenMPThreadsAndAffinityCPUs(num_threads_hint,
cpu_ids,
cores_to_use,
sched_policy);
}
......
......@@ -17,7 +17,6 @@
#include <errno.h>
#include <unwind.h>
#include <dlfcn.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/types.h>
......@@ -50,46 +49,60 @@ LogWriter *AndroidEnv::GetLogWriter() {
namespace {
// Cursor over the caller-supplied buffer that receives program-counter
// values while the stack is unwound.
struct BacktraceState {
void** current;
void** end;
// Next slot to be written; advanced by UnwindCallback for every frame stored.
void **current;
// One past the last usable slot (initialised to buffer + max in BackTrace).
void **end;
};
// Per-frame callback handed to _Unwind_Backtrace.
// `arg` is the BacktraceState describing the output buffer. A non-zero
// instruction pointer is appended to the buffer; once the buffer is full the
// callback returns _URC_END_OF_STACK, otherwise it returns _URC_NO_REASON.
_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context* context, void* arg) {
BacktraceState* state = static_cast<BacktraceState*>(arg);
_Unwind_Reason_Code UnwindCallback(struct _Unwind_Context *context, void *arg) {
BacktraceState *state = static_cast<BacktraceState *>(arg);
uintptr_t pc = _Unwind_GetIP(context);
// Frames whose instruction pointer is 0 are not recorded.
if (pc) {
if (state->current == state->end) {
// No free slot left in the caller's buffer.
return _URC_END_OF_STACK;
} else {
*state->current++ = reinterpret_cast<void*>(pc);
*state->current++ = reinterpret_cast<void *>(pc);
}
}
return _URC_NO_REASON;
}
// Records the program counters of the current call stack into `buffer`.
// buffer: output array with room for at least `max` entries.
// max: capacity of `buffer`, in entries.
// Returns the number of entries actually written (at most `max`).
size_t BackTrace(void** buffer, size_t max) {
size_t BackTrace(void **buffer, size_t max) {
BacktraceState state = {buffer, buffer + max};
_Unwind_Backtrace(UnwindCallback, &state);
// Distance between the write cursor and the start is the stored frame count.
return state.current - buffer;
}
// Tells whether /sys/devices/system/cpu/cpu<cpu_id>/isolate holds a non-zero
// value. A file that cannot be opened, or that yields no line, counts as
// "not isolated".
bool CpuIsolate(size_t cpu_id) {
  const std::string path =
      MakeString("/sys/devices/system/cpu/cpu", cpu_id, "/isolate");
  std::ifstream stream(path);
  if (!stream.is_open()) {
    return false;
  }

  std::string first_line;
  const bool got_line = static_cast<bool>(std::getline(stream, first_line));
  stream.close();
  if (!got_line) {
    return false;
  }

  // Base 0 lets strtol accept decimal, octal and hexadecimal spellings.
  const int flag = strtol(first_line.c_str(), nullptr, 0);
  return flag != 0;
}
} // namespace
MaceStatus AndroidEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
// compute mask
cpu_set_t mask;
CPU_ZERO(&mask);
for (auto cpu_id : cpu_ids) {
CPU_SET(cpu_id, &mask);
}
pid_t pid = gettid();
int err = sched_setaffinity(pid, sizeof(mask), &mask);
if (err) {
LOG(WARNING) << "SchedSetAffinity failed: " << strerror(errno);
return MaceStatus(MaceStatus::MACE_INVALID_ARGS,
"SchedSetAffinity failed: " +
std::string(strerror(errno)));
MaceStatus AndroidEnv::GetCPUMaxFreq(std::vector<float> *max_freqs) {
MACE_RETURN_IF_ERROR(LinuxBaseEnv::GetCPUMaxFreq(max_freqs));
size_t cpu_num = (max_freqs != nullptr) ? max_freqs->size() : 0;
if (cpu_num > 0) {
for (size_t i = 0; i < cpu_num; ++i) {
if (CpuIsolate(i)) {
(*max_freqs)[i] = 0;
}
}
}
return MaceStatus::MACE_SUCCESS;
......@@ -103,8 +116,8 @@ std::vector<std::string> AndroidEnv::GetBackTraceUnsafe(int max_steps) {
for (int i = 0; i < steps; ++i) {
std::ostringstream os;
const void* addr = buffer[i];
const char* symbol = "";
const void *addr = buffer[i];
const char *symbol = "";
Dl_info info;
if (dladdr(addr, &info) && info.dli_sname) {
symbol = info.dli_sname;
......
......@@ -29,8 +29,8 @@ namespace port {
class AndroidEnv : public LinuxBaseEnv {
public:
MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
LogWriter *GetLogWriter() override;
MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
std::unique_ptr<MallocLogger> NewMallocLogger(
std::ostringstream *oss,
......
......@@ -15,6 +15,8 @@
#include "mace/port/darwin/env.h"
#include <execinfo.h>
#include <mach/thread_act.h>
#include <mach/thread_policy.h>
#include <stdint.h>
#include <sys/sysctl.h>
#include <sys/time.h>
......@@ -33,27 +35,64 @@ namespace mace {
namespace port {
namespace {
const char kCpuFrequencyMax[] = "hw.cpufrequency_max";
constexpr const char kCpuFrequencyMax[] = "hw.cpufrequency_max";
constexpr const char kCpuActiveNum[] = "hw.activecpu";
}
// Forwards to the shared POSIX implementation and returns its timestamp
// unchanged.
int64_t DarwinEnv::NowMicros() {
  const int64_t now = mace::port::posix::NowMicros();
  return now;
}
// Fills the output vector with one "max frequency" entry per CPU on Darwin.
// The value comes from the hw.cpufrequency_max sysctl and, in the variant that
// also queries hw.activecpu, is repeated once per active CPU, so every core
// reports the same number. Returns MACE_RUNTIME_ERROR when a sysctl lookup
// fails and MACE_SUCCESS otherwise.
// NOTE(review): two versions of this function are interleaved here (the
// signature and the declaration of `freq` each appear twice); keep only one.
// TODO(luxuhui): this func is not accurate, darwin does not support
// acquiring CPU frequencies, we need to reconsider the CPU scheduling
// strategy.
MaceStatus DarwinEnv::GetCPUMaxFreq(std::vector<float> *max_freqs) {
MACE_CHECK_NOTNULL(max_freqs);
// We can't get the frequency of every CPU on Darwin, so this method
// returns fake frequency data.
MaceStatus DarwinEnv::GetCPUMaxFreq(std::vector<float> *cpu_infos) {
MACE_CHECK_NOTNULL(cpu_infos);
uint64_t freq = 0;
// NOTE(review): with this declaration sysctlbyname receives a float-sized
// buffer. The other declaration uses uint64_t, so the sysctl value is
// presumably an integer; reading it into a float would misinterpret it or
// fail for lack of space. Confirm which declaration is intended.
float freq = 0;
size_t size = sizeof(freq);
int ret = sysctlbyname(kCpuFrequencyMax, &freq, &size, NULL, 0);
if (ret < 0) {
LOG(ERROR) << "failed to get property: " << kCpuFrequencyMax;
return MaceStatus::MACE_RUNTIME_ERROR;
}
max_freqs->push_back(freq);
uint64_t cpu_num = 0;
size = sizeof(cpu_num);
ret = sysctlbyname(kCpuActiveNum, &cpu_num, &size, NULL, 0);
if (ret < 0) {
LOG(ERROR) << "failed to get property: " << kCpuActiveNum;
return MaceStatus::MACE_RUNTIME_ERROR;
}
// Repeat the single frequency once per reported CPU.
// NOTE(review): `i` is a signed int compared against an unsigned 64-bit count.
for (int i = 0; i < cpu_num; ++i) {
cpu_infos->push_back(freq);
}
return MaceStatus::MACE_SUCCESS;
}
// Applies a Mach thread-affinity hint to the calling thread.
//
// cpu_ids: core ids requested by the caller. They are folded into a single
//   affinity tag (sum of cpu_ids[i] << i, truncated to unsigned int).
// Returns MACE_RUNTIME_ERROR when thread_policy_set() reports a failure and
// MACE_SUCCESS otherwise. When MACE_OS_MAC is not defined nothing is applied
// and the call succeeds.
MaceStatus DarwinEnv::SchedSetAffinity(
    const std::vector<size_t> &cpu_ids) {
  // Shifting a size_t by its full width or more is undefined behaviour, so
  // entries beyond that position are not folded into the tag.
  constexpr size_t kMaxShift = sizeof(size_t) * 8;
  unsigned int tag = 0;
  for (size_t i = 0; i < cpu_ids.size() && i < kMaxShift; ++i) {
    tag += static_cast<unsigned int>(cpu_ids[i] << i);
  }

#ifdef MACE_OS_MAC
  const mach_port_t mach_port = pthread_mach_thread_np(pthread_self());
  thread_affinity_policy_data_t policy_data = {static_cast<integer_t>(tag)};
  const kern_return_t ret = thread_policy_set(
      mach_port,
      THREAD_AFFINITY_POLICY,
      reinterpret_cast<thread_policy_t>(&policy_data),
      THREAD_AFFINITY_POLICY_COUNT);
  if (ret != KERN_SUCCESS) {
    // thread_policy_set() reports failures through its kern_return_t result;
    // it does not set errno, so the return code itself is logged.
    LOG(WARNING) << "thread_policy_set failed, kern_return_t: " << ret;
    return MaceStatus::MACE_RUNTIME_ERROR;
  }
#else
  // The tag is only consumed by the macOS-specific call above.
  static_cast<void>(tag);
#endif

  return MaceStatus::MACE_SUCCESS;
}
......
......@@ -20,6 +20,7 @@
#include "mace/port/env.h"
#include "mace/port/logger.h"
#include "mace/port/port-arch.h"
#include "mace/port/posix/file_system.h"
namespace mace {
......@@ -29,6 +30,7 @@ class DarwinEnv : public Env {
public:
int64_t NowMicros() override;
MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
FileSystem *GetFileSystem() override;
LogWriter *GetLogWriter() override;
std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
......
......@@ -25,10 +25,21 @@
#include "mace/port/posix/backtrace.h"
#include "mace/port/posix/file_system.h"
#include "mace/port/posix/time.h"
#include "mace/utils/macros.h"
namespace mace {
namespace port {
// Intentional no-op: on our embedded Linux devices, setting the scheduler
// affinity hurt performance, so this override ignores the request and
// reports success. If you comment out this override, you may get better
// performance, as we do on Android devices.
MaceStatus LinuxEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
  MACE_UNUSED(cpu_ids);
  return MaceStatus(MaceStatus::MACE_SUCCESS);
}
// Returns the address of log_writer_.
LogWriter *LinuxEnv::GetLogWriter() {
  LogWriter *const writer = &log_writer_;
  return writer;
}
......
......@@ -26,6 +26,7 @@ namespace port {
class LinuxEnv : public LinuxBaseEnv {
public:
MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
LogWriter *GetLogWriter() override;
std::vector<std::string> GetBackTraceUnsafe(int max_steps) override;
......
......@@ -14,7 +14,10 @@
#include "mace/port/linux_base/env.h"
#include <errno.h>
#include <sys/syscall.h>
#include <sys/time.h>
#include <unistd.h>
#include <cstddef>
#include <fstream>
......@@ -28,7 +31,6 @@
namespace mace {
namespace port {
namespace {
int GetCPUCount() {
......@@ -100,5 +102,24 @@ MaceStatus LinuxBaseEnv::GetCPUMaxFreq(std::vector<float> *max_freqs) {
return MaceStatus::MACE_SUCCESS;
}
// Restricts the calling thread to the given CPU cores via sched_setaffinity().
//
// cpu_ids: ids of the cores the thread may run on.
// Returns MACE_SUCCESS on success. On failure a warning is logged and a
// MACE_INVALID_ARGS status carrying the strerror() text is returned.
MaceStatus LinuxBaseEnv::SchedSetAffinity(const std::vector<size_t> &cpu_ids) {
  cpu_set_t mask;
  CPU_ZERO(&mask);
  for (const size_t cpu_id : cpu_ids) {
    CPU_SET(cpu_id, &mask);
  }

  // The affinity is applied to the thread id obtained from SYS_gettid.
  const pid_t tid = static_cast<pid_t>(syscall(SYS_gettid));
  if (sched_setaffinity(tid, sizeof(mask), &mask) != 0) {
    // Snapshot errno right away: logging may run code that overwrites it,
    // which would make the returned message disagree with the logged one.
    const int saved_errno = errno;
    const std::string message =
        "SchedSetAffinity failed: " + std::string(strerror(saved_errno));
    LOG(WARNING) << message;
    return MaceStatus(MaceStatus::MACE_INVALID_ARGS, message);
  }

  return MaceStatus::MACE_SUCCESS;
}
} // namespace port
} // namespace mace
......@@ -28,6 +28,7 @@ class LinuxBaseEnv : public Env {
int64_t NowMicros() override;
MaceStatus GetCPUMaxFreq(std::vector<float> *max_freqs) override;
FileSystem *GetFileSystem() override;
MaceStatus SchedSetAffinity(const std::vector<size_t> &cpu_ids) override;
protected:
PosixFileSystem posix_file_system_;
......
......@@ -13,6 +13,8 @@
// limitations under the License.
#include <algorithm>
#include <numeric>
#include "mace/port/port.h"
#include "mace/port/env.h"
#include "mace/utils/logging.h"
......@@ -26,6 +28,8 @@ namespace utils {
// Thread-pool spin-wait duration in nanoseconds (2,000,000 ns = 2 ms).
constexpr int kThreadPoolSpinWaitTime = 2000000; // ns
// NOTE(review): presumably the number of work tiles given to each thread;
// its users are not visible here, so confirm.
constexpr int kTileCountPerThread = 2;
// NOTE(review): presumably the cost threshold up to which work runs on a
// single thread; its users are not visible here, so confirm.
constexpr int kMaxCostUsingSingleThread = 100;
// GetCpuCoresForPerfomance() counts cores unconditionally until this many
// have been selected.
constexpr int kMinCpuCoresForPerformance = 3;
// GetCpuCoresForPerfomance() stops counting above-average-frequency cores
// once this many have been selected.
constexpr int kMaxCpuCoresForPerformance = 5;
namespace {
......@@ -42,67 +46,87 @@ struct CPUFreq {
float freq;
};
void GetCPUCoresToUse(const std::vector<float> &cpu_max_freqs,
const CPUAffinityPolicy policy,
const size_t thread_count_hint,
std::vector<size_t> *cores) {
size_t thread_count = thread_count_hint;
if (!cpu_max_freqs.empty()) {
const size_t cpu_count = cpu_max_freqs.size();
if (thread_count == 0 || thread_count > cpu_count) {
thread_count = cpu_count;
// Decides how many entries of `cpu_freqs` should be used.
// The average frequency is computed over the entries whose frequency is not 0.
// Entries are then visited in order; an entry is counted when fewer than
// kMinCpuCoresForPerformance cores are selected so far, or when its frequency
// is above the average and fewer than kMaxCpuCoresForPerformance are selected.
// Returns the resulting count.
// NOTE(review): the lines from `if (policy != ...` up to the final return
// refer to names that are not declared in this function (policy,
// cpu_max_freqs); they look like leftovers of another function. Confirm.
size_t GetCpuCoresForPerfomance(const std::vector<CPUFreq> &cpu_freqs) {
// NOTE(review): the initial value `0` is an int, so std::accumulate keeps an
// int accumulator and converts every partial sum back to int; 0.0f would keep
// the sum in float.
float total_freq = std::accumulate(cpu_freqs.begin(), cpu_freqs.end(), 0,
[](float accum, CPUFreq cpu_freq) {
return accum + cpu_freq.freq;
});
// Entries with a frequency of 0 are excluded from the average.
size_t valid_cpu_nums = std::count_if(cpu_freqs.begin(), cpu_freqs.end(),
[](CPUFreq cpu_freq) {
return cpu_freq.freq != 0;
});
// NOTE(review): valid_cpu_nums is 0 when every frequency is 0, which makes
// this a division by zero.
float avg_freq = total_freq / valid_cpu_nums;
size_t cores_to_use = 0;
for (auto cpu_info : cpu_freqs) {
if ((cpu_info.freq > avg_freq
&& cores_to_use < kMaxCpuCoresForPerformance)
|| cores_to_use < kMinCpuCoresForPerformance) {
++cores_to_use;
}
}
if (policy != CPUAffinityPolicy::AFFINITY_NONE) {
std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
cpu_freq[i].core_id = i;
cpu_freq[i].freq = cpu_max_freqs[i];
}
if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[=](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq < rhs.freq;
});
} else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq > rhs.freq;
});
}
return cores_to_use;
}
// decide num of cores to use
size_t cores_to_use = 0;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
|| policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
if (cpu_freq[i].freq != cpu_freq[0].freq) {
break;
}
++cores_to_use;
}
} else {
cores_to_use = thread_count;
}
MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
cores->resize(cores_to_use);
for (size_t i = 0; i < cores_to_use; ++i) {
VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id
<< " with freq "
<< cpu_freq[i].freq;
(*cores)[i] = static_cast<int>(cpu_freq[i].core_id);
}
}
} else {
} // namespace
// Selects the CPU cores that threads should be bound to.
//
// cpu_max_freqs: one frequency per core; the index is used as the core id.
// policy: affinity policy; it controls the sort order and how many cores are
//   taken. With AFFINITY_NONE `cores` is left untouched.
// thread_count_hint: requested thread count; 0 or a value above the core
//   count is replaced by the core count.
// cores: output vector, resized to the number of selected cores and filled
//   with their ids in sorted order.
// Returns MACE_RUNTIME_ERROR when cpu_max_freqs is empty, otherwise
// MACE_SUCCESS.
MaceStatus GetCPUCoresToUse(const std::vector<float> &cpu_max_freqs,
const CPUAffinityPolicy policy,
const size_t thread_count_hint,
std::vector<size_t> *cores) {
if (cpu_max_freqs.empty()) {
LOG(ERROR) << "CPU core is empty";
return MaceStatus::MACE_RUNTIME_ERROR;
}
// Clamp the hint to the number of cores that were reported.
size_t thread_count = thread_count_hint;
const size_t cpu_count = cpu_max_freqs.size();
if (thread_count == 0 || thread_count > cpu_count) {
thread_count = cpu_count;
}
// NOTE(review): the next two closing braces do not match any opening brace
// of this function; they look like leftovers of another version. Confirm.
}
} // namespace
if (policy != CPUAffinityPolicy::AFFINITY_NONE) {
// Remember each core's original index before sorting by frequency.
std::vector<CPUFreq> cpu_freq(cpu_max_freqs.size());
for (size_t i = 0; i < cpu_max_freqs.size(); ++i) {
cpu_freq[i].core_id = i;
cpu_freq[i].freq = cpu_max_freqs[i];
}
// POWER_SAVE / LITTLE_ONLY: lowest frequency first.
// HIGH_PERFORMANCE / BIG_ONLY: highest frequency first.
if (policy == CPUAffinityPolicy::AFFINITY_POWER_SAVE ||
policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[=](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq < rhs.freq;
});
} else if (policy == CPUAffinityPolicy::AFFINITY_HIGH_PERFORMANCE ||
policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY) {
std::sort(cpu_freq.begin(),
cpu_freq.end(),
[](const CPUFreq &lhs, const CPUFreq &rhs) {
return lhs.freq > rhs.freq;
});
}
// Decide the number of cores to use.
// NOTE(review): the same helper is applied to both the descending (BIG_ONLY)
// and the ascending (LITTLE_ONLY) order; confirm that the count it returns is
// meaningful for the ascending case.
size_t cores_to_use = 0;
if (policy == CPUAffinityPolicy::AFFINITY_BIG_ONLY
|| policy == CPUAffinityPolicy::AFFINITY_LITTLE_ONLY) {
cores_to_use = GetCpuCoresForPerfomance(cpu_freq);
} else {
cores_to_use = thread_count;
}
MACE_CHECK(cores_to_use > 0, "number of cores to use should > 0");
cores->resize(cores_to_use);
for (size_t i = 0; i < cores_to_use; ++i) {
VLOG(2) << "Bind thread to core: " << cpu_freq[i].core_id
<< " with freq "
<< cpu_freq[i].freq;
// NOTE(review): the id is cast to int and then stored into a size_t element.
(*cores)[i] = static_cast<int>(cpu_freq[i].core_id);
}
}
return MaceStatus::MACE_SUCCESS;
}
ThreadPool::ThreadPool(const size_t thread_count_hint,
const CPUAffinityPolicy policy)
......@@ -173,13 +197,13 @@ void ThreadPool::Run(const std::function<void(const int64_t)> &func,
std::unique_lock<std::mutex> run_lock(run_mutex_);
for (size_t i = 0; i < thread_count; ++i) {
int64_t count = iters_per_thread + (static_cast<int64_t>(i) < remainder);
int64_t range_len =
iters_per_thread + (static_cast<int64_t>(i) < remainder);
thread_infos_[i].range_start = iters_offset;
int64_t range_end = iters_offset + count;
thread_infos_[i].range_end = range_end;
thread_infos_[i].range_len = range_end - iters_offset;
thread_infos_[i].range_len = range_len;
thread_infos_[i].range_end = iters_offset + range_len;
thread_infos_[i].func = reinterpret_cast<uintptr_t>(&func);
iters_offset += thread_infos_[i].range_len;
iters_offset = thread_infos_[i].range_end;
}
count_down_latch_.Reset(thread_count - 1);
......
......@@ -29,6 +29,11 @@
namespace mace {
namespace utils {
// Selects the CPU cores that threads should be bound to.
//
// cpu_max_freqs: one frequency per core; the index is used as the core id.
// policy: affinity policy that determines the ordering and number of cores.
// thread_count_hint: requested thread count; 0 or a value above the core
//   count is replaced by the core count.
// cores: output vector that receives the selected core ids; the definition
//   leaves it untouched when policy is AFFINITY_NONE.
// Returns MACE_RUNTIME_ERROR when cpu_max_freqs is empty, otherwise
// MACE_SUCCESS.
MaceStatus GetCPUCoresToUse(const std::vector<float> &cpu_max_freqs,
const CPUAffinityPolicy policy,
const size_t thread_count_hint,
std::vector<size_t> *cores);
class ThreadPool {
public:
ThreadPool(const size_t thread_count,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册