Commit f7b617be authored by Xie XiuQi

sched/cputime: use sched idle time accounting

hulk inclusion
category: bugfix
bugzilla: 13257
CVE: NA

-------------------------------------------------

Introduce a "use-sched-idle-time" option to enable/disable
sched idle time accounting instead of tick-based time accounting.

nohz mode: use get_idle_time;
nohz=off mode: use idle time accounting in sched_info_switch function.

You can also use "use-sched-idle-time=force" to force the use of
idle time accounting in the sched_info_switch function.
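
For example (a sketch; how the parameter is appended depends on your
bootloader), the option is given on the kernel command line at boot:

    use-sched-idle-time           # sched idle time accounting; idle source depends on nohz
    use-sched-idle-time=force     # always collect idle time in sched_info_switch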

guarantee:
cpu_clock(cpu) - idle == utime + stime

We use this time in /proc/stat.
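
For reference, each per-cpu line of /proc/stat lists, in USER_HZ ticks:

    cpuN  user nice system idle iowait irq softirq steal guest guest_nice

and it is the user, system and idle columns that this accounting affects.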

We use get_idle_time instead of collecting idle time at context
switch. Since get_idle_time is available when nohz is enabled, we do
not collect idle time at context switch in that case.

Issue 1:
1) boot euleros
2) bind a kernel thread to a cpu (e.g. cpu1) and run it for a long time (more than 10 minutes)
3) stop this kernel thread
4) run a spin-loop process in userspace
5) then utime is 0, but stime is ~100%

for example:
stime = 10000; utime = 1; rtime = 10001;
Run 10s in userspace now, then

rtime = clock - idle  (10001 + 10 = 10011)
stime = 10 * 10000/(1 + 10 + 10000) ≈ 10
utime = 10 * (1 + 10)/(1 + 10 + 10000) ≈ 0

That's the problem.

We just adjust the delta time to reduce the accumulated
error of stime/utime:
delta = rtime - (utime + stime)
stime += delta * delta_of_stat_stime / (delta_of_stat_stime + delta_of_stat_utime)
utime = rtime - stime
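
Applied to the example above (same numbers, just as a sketch): during the
10s of userspace spinning, the tick-based stats move by
delta_of_stat_utime ≈ 10 and delta_of_stat_stime ≈ 0, so

    delta = 10011 - (1 + 10000) = 10
    stime += 10 * 0 / (0 + 10) = 0      (stime stays ≈ 10000)
    utime  = rtime - stime = 10011 - 10000 = 11

and the new 10 seconds are charged to utime, as expected.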

Issue 2:
When a cpu is offline we don't need to adjust the user and system time.
Only when a cpu is online can we assume:

utime% + stime% + idle% = 100%.

Issue 3:
When nohz=off, we must add 'now - last_entry_idle_time' to sum_idle_time.
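
For instance (illustrative numbers): if the cpu entered idle at t = 100s
(last_entry_idle_time) and /proc/stat is read while it is still idle at
now = 105s, the in-progress idle period has not yet been folded into
sum_idle_time, so the reported idle time must be sum_idle_time + 5s.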

Tested-by: Xie Zhipeng <xiezhipeng1@huawei.com>
Signed-off-by: Xie XiuQi <xiexiuqi@huawei.com>
Signed-off-by: Yang Yingliang <yangyingliang@huawei.com>
Parent 6bde0fc2
@@ -45,14 +45,16 @@ static u64 get_iowait_time(int cpu)
#else
static u64 get_idle_time(int cpu)
u64 get_idle_time(int cpu)
{
        u64 idle, idle_usecs = -1ULL;

        if (cpu_online(cpu))
                idle_usecs = get_cpu_idle_time_us(cpu, NULL);

        if (idle_usecs == -1ULL)
        if (idle_usecs == -1ULL && use_sched_idle_time)
                return sched_get_idle_time(cpu);
        else if (idle_usecs == -1ULL)
                /* !NO_HZ or cpu offline so we can rely on cpustat.idle */
                idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
        else
@@ -95,9 +97,19 @@ static int show_stat(struct seq_file *p, void *v)
        getboottime64(&boottime);

        for_each_possible_cpu(i) {
                if (use_sched_idle_time && cpu_online(i)) {
                        u64 u = 0, s = 0;

                        sched_idle_time_adjust(i, &u, &s);
                        user += u;
                        system += s;
                } else {
                        user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
                        nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
                        system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
                }
                nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
                idle += get_idle_time(i);
                iowait += get_iowait_time(i);
                irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
@@ -131,9 +143,13 @@ static int show_stat(struct seq_file *p, void *v)
        for_each_online_cpu(i) {
                /* Copy values here to work around gcc-2.95.3, gcc-2.96 */
                if (use_sched_idle_time) {
                        sched_idle_time_adjust(i, &user, &system);
                } else {
                        user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
                        nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
                        system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
                }
                nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
                idle = get_idle_time(i);
                iowait = get_iowait_time(i);
                irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
......
@@ -186,4 +186,9 @@ static inline void prev_cputime_init(struct prev_cputime *prev)
extern unsigned long long
task_sched_runtime(struct task_struct *task);
extern int use_sched_idle_time;
extern int sched_idle_time_adjust(int cpu, u64 *utime, u64 *stime);
extern unsigned long long sched_get_idle_time(int cpu);
extern u64 get_idle_time(int cpu);
#endif /* _LINUX_SCHED_CPUTIME_H */
@@ -22,9 +22,11 @@ static inline void nohz_balance_enter_idle(int cpu) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
void calc_load_nohz_start(void);
void calc_load_nohz_stop(void);
#else
#define tick_nohz_active (0)
static inline void calc_load_nohz_start(void) { }
static inline void calc_load_nohz_stop(void) { }
#endif /* CONFIG_NO_HZ_COMMON */
......
@@ -5913,6 +5913,17 @@ static struct kmem_cache *task_group_cache __read_mostly;
DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);

static __init void rq_cputime_init(void)
{
        int cpu;
        struct rq_cputime *rq_cputime;

        for_each_possible_cpu(cpu) {
                rq_cputime = &per_cpu(rq_cputimes, cpu);
                raw_spin_lock_init(&rq_cputime->lock);
        }
}

void __init sched_init(void)
{
        int i, j;
@@ -6074,6 +6085,9 @@ void __init sched_init(void)
        init_schedstats();

        if (use_sched_idle_time)
                rq_cputime_init();

        scheduler_running = 1;
}
......
@@ -567,6 +567,74 @@ static u64 scale_stime(u64 stime, u64 rtime, u64 total)
        return scaled;
}

int use_sched_idle_time __read_mostly;
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq_cputime, rq_cputimes);

static int __init sched_idle_time_setup(char *str)
{
        use_sched_idle_time = 1;
        return 0;
}
early_param("use-sched-idle-time", sched_idle_time_setup);

int sched_idle_time_adjust(int cpu, u64 *utime, u64 *stime)
{
        struct rq_cputime *rq_cputime = &per_cpu(rq_cputimes, cpu);
        struct cputime *prev = &rq_cputime->cpu_prev_time;
        struct cputime *last = &rq_cputime->cpu_last_time;
        u64 ut, st, delta, delta_ut, delta_st;

        raw_spin_lock(&rq_cputime->lock);

        delta = cpu_clock(cpu) - get_idle_time(cpu)
                - (prev->utime + prev->stime);

        ut = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
        st = kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
        delta_ut = ut - last->utime;
        delta_st = st - last->stime;

        if (unlikely((s64)delta <= 0))
                goto out;

        if (delta_st == 0) {
                prev->utime += delta;
        } else if (delta_ut == 0) {
                prev->stime += delta;
        } else {
                delta_st = scale_stime(delta_st, delta, delta_ut + delta_st);
                if (unlikely(delta_st > delta))
                        delta_st = delta;
                prev->stime += delta_st;
                prev->utime += delta - delta_st;
        }

out:
        last->utime = ut;
        last->stime = st;

        *utime = prev->utime;
        *stime = prev->stime;

        raw_spin_unlock(&rq_cputime->lock);

        return 0;
}

unsigned long long sched_get_idle_time(int cpu)
{
        struct rq_cputime *rt = &per_cpu(rq_cputimes, cpu);

        if (is_idle_task(curr_task(cpu)))
                return rt->sum_idle_time + cpu_clock(cpu) - rt->last_entry_idle;
        else
                return rt->sum_idle_time;
}

/*
 * Adjust tick based cputime random precision against scheduler runtime
 * accounting.
......
@@ -919,6 +919,21 @@ struct rq {
#endif
};

struct cputime {
        u64 utime;
        u64 stime;
};

struct rq_cputime {
        raw_spinlock_t lock;
        unsigned long long sum_idle_time;
        unsigned long long last_entry_idle;
        struct cputime cpu_prev_time;
        struct cputime cpu_last_time;
};

DECLARE_PER_CPU(struct rq_cputime, rq_cputimes);

static inline int cpu_of(struct rq *rq)
{
#ifdef CONFIG_SMP
......
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/sched/cputime.h>
#ifdef CONFIG_SCHEDSTATS
@@ -153,6 +154,20 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
static inline void
sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
{
        if (use_sched_idle_time && !tick_nohz_active) {
                struct rq *rq = task_rq(prev);
                struct rq_cputime *rq_cputime = this_cpu_ptr(&rq_cputimes);
                unsigned long long now = cpu_clock(cpu_of(rq)), delta = 0;

                if (prev == rq->idle) {
                        delta = now - rq_cputime->last_entry_idle;
                        rq_cputime->sum_idle_time += delta;
                }

                if (next == rq->idle)
                        rq_cputime->last_entry_idle = now;
        }

        if (unlikely(sched_info_on()))
                __sched_info_switch(rq, prev, next);
}
......
@@ -149,14 +149,12 @@ static inline void tick_nohz_init(void) { }
#endif
#ifdef CONFIG_NO_HZ_COMMON
extern unsigned long tick_nohz_active;
extern void timers_update_nohz(void);
# ifdef CONFIG_SMP
extern struct static_key_false timers_migration_enabled;
# endif
#else /* CONFIG_NO_HZ_COMMON */
static inline void timers_update_nohz(void) { }
#define tick_nohz_active (0)
#endif
DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
......