From f7b617bef2e6f974541e4ff2d04df9b9ad3190a3 Mon Sep 17 00:00:00 2001
From: Xie XiuQi
Date: Sat, 30 Mar 2019 18:04:41 +0800
Subject: [PATCH] sched/cputime: use sched idle time accounting
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

hulk inclusion
category: bugfix
bugzilla: 13257
CVE: NA

-------------------------------------------------

Introduce the "use-sched-idle-time" option to enable/disable sched idle
time accounting instead of tick-based time accounting.

  nohz mode:     use get_idle_time;
  nohz=off mode: use idle time accounting in the sched_info_switch function.

You can also use "use-sched-idle-time=force" to force idle time
accounting in the sched_info_switch function.

Guarantee: cpu_clock(cpu) - idle == utime + stime

We use this time in /proc/stat. We use get_idle_time() instead of
collecting idle time on context switch, because get_idle_time() is
available when nohz is enabled; so we do not collect idle time on
context switch when nohz is enabled.

Issue 1:
1) boot EulerOS
2) bind a kernel thread to a cpu (e.g. cpu1) and let it run for a long
   time (more than 10 minutes)
3) stop this kernel thread
4) run a spin-loop process in userspace
5) then utime is 0, but stime is ~100%

for example:
  stime = 10000; utime = 1; rtime = 10001;

Run 10s in userspace now; then rtime = clock - idle (10001 + 10 = 10011):
  stime = 10 * 10000 / (1 + 10 + 10000) ≈ 10
  utime = 10 * (1 + 10) / (1 + 10 + 10000) ≈ 0

That is the problem. We adjust the delta time to reduce the accumulated
error of stime/utime:

  delta  = rtime - (utime + stime)
  stime += delta * delta_of_stat_stime / (delta_of_stat_stime + delta_of_stat_utime)
  utime  = rtime - stime

Issue 2:
When a cpu is offline we don't need to adjust the user and system time.
Only when a cpu is online can we assume: utime% + stime% + idle% = 100%.

Issue 3:
When nohz=off, we must add 'now - last_entry_idle_time' to sum_idle_time.
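
For illustration only (not part of the kernel change itself): a minimal
userspace sketch of the delta-splitting rule above, fed with the Issue 1
numbers. The helper name split_delta() is invented for this example, and
plain 64-bit division stands in for the kernel's scale_stime() and its
overflow/clamping handling.

#include <stdint.h>
#include <stdio.h>

/*
 * Split the new wall-clock delta between stime and utime in proportion
 * to the cpustat deltas:
 *
 *   delta  = rtime - (utime + stime)
 *   stime += delta * delta_stat_stime / (delta_stat_stime + delta_stat_utime)
 *   utime  = rtime - stime
 */
static void split_delta(uint64_t rtime, uint64_t *utime, uint64_t *stime,
			uint64_t delta_stat_utime, uint64_t delta_stat_stime)
{
	uint64_t delta = rtime - (*utime + *stime);

	if (delta_stat_utime == 0)
		*stime += delta;	/* no user movement: charge all to system */
	else if (delta_stat_stime != 0)
		*stime += delta * delta_stat_stime /
			  (delta_stat_stime + delta_stat_utime);
	/* delta_stat_stime == 0: the whole delta becomes user time below */

	*utime = rtime - *stime;
}

int main(void)
{
	/* Issue 1 numbers after 10s of pure userspace spinning. */
	uint64_t utime = 1, stime = 10000, rtime = 10011;

	/* The 10 new ticks were all user time in cpustat. */
	split_delta(rtime, &utime, &stime, 10, 0);

	/* Prints utime=11 stime=10000: the new time is charged to user. */
	printf("utime=%llu stime=%llu\n",
	       (unsigned long long)utime, (unsigned long long)stime);
	return 0;
}

With the old tick-based scaling, the same 10 ticks would have been
scaled against the accumulated totals and charged almost entirely to
stime, as the example above shows.
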
Tested-by: Xie Zhipeng
Signed-off-by: Xie XiuQi
Signed-off-by: Yang Yingliang
---
 fs/proc/stat.c                | 28 +++++++++++----
 include/linux/sched/cputime.h |  5 +++
 include/linux/sched/nohz.h    |  2 ++
 kernel/sched/core.c           | 14 ++++++++
 kernel/sched/cputime.c        | 68 +++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h          | 15 ++++++++
 kernel/sched/stats.h          | 15 ++++++++
 kernel/time/tick-internal.h   |  2 --
 8 files changed, 141 insertions(+), 8 deletions(-)

diff --git a/fs/proc/stat.c b/fs/proc/stat.c
index 535eda7857cf..7e832b24847d 100644
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -45,14 +45,16 @@ static u64 get_iowait_time(int cpu)
 
 #else
 
-static u64 get_idle_time(int cpu)
+u64 get_idle_time(int cpu)
 {
 	u64 idle, idle_usecs = -1ULL;
 
 	if (cpu_online(cpu))
 		idle_usecs = get_cpu_idle_time_us(cpu, NULL);
 
-	if (idle_usecs == -1ULL)
+	if (idle_usecs == -1ULL && use_sched_idle_time)
+		return sched_get_idle_time(cpu);
+	else if (idle_usecs == -1ULL)
 		/* !NO_HZ or cpu offline so we can rely on cpustat.idle */
 		idle = kcpustat_cpu(cpu).cpustat[CPUTIME_IDLE];
 	else
@@ -95,9 +97,19 @@ static int show_stat(struct seq_file *p, void *v)
 	getboottime64(&boottime);
 
 	for_each_possible_cpu(i) {
-		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+		if (use_sched_idle_time && cpu_online(i)) {
+			u64 u = 0, s = 0;
+
+			sched_idle_time_adjust(i, &u, &s);
+
+			user += u;
+			system += s;
+		} else {
+			user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
+			system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+		}
+
 		nice += kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-		system += kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
 		idle += get_idle_time(i);
 		iowait += get_iowait_time(i);
 		irq += kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
@@ -131,9 +143,13 @@ static int show_stat(struct seq_file *p, void *v)
 
 	for_each_online_cpu(i) {
 		/* Copy values here to work around gcc-2.95.3, gcc-2.96 */
-		user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+		if (use_sched_idle_time) {
+			sched_idle_time_adjust(i, &user, &system);
+		} else {
+			user = kcpustat_cpu(i).cpustat[CPUTIME_USER];
+			system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
+		}
 		nice = kcpustat_cpu(i).cpustat[CPUTIME_NICE];
-		system = kcpustat_cpu(i).cpustat[CPUTIME_SYSTEM];
 		idle = get_idle_time(i);
 		iowait = get_iowait_time(i);
 		irq = kcpustat_cpu(i).cpustat[CPUTIME_IRQ];
diff --git a/include/linux/sched/cputime.h b/include/linux/sched/cputime.h
index 53f883f5a2fd..1ebbeec02051 100644
--- a/include/linux/sched/cputime.h
+++ b/include/linux/sched/cputime.h
@@ -186,4 +186,9 @@ static inline void prev_cputime_init(struct prev_cputime *prev)
 
 extern unsigned long long task_sched_runtime(struct task_struct *task);
 
+extern int use_sched_idle_time;
+extern int sched_idle_time_adjust(int cpu, u64 *utime, u64 *stime);
+extern unsigned long long sched_get_idle_time(int cpu);
+extern u64 get_idle_time(int cpu);
+
 #endif /* _LINUX_SCHED_CPUTIME_H */
diff --git a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h
index b36f4cf38111..631d8579c257 100644
--- a/include/linux/sched/nohz.h
+++ b/include/linux/sched/nohz.h
@@ -22,9 +22,11 @@ static inline void nohz_balance_enter_idle(int cpu) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
 void calc_load_nohz_start(void);
 void calc_load_nohz_stop(void);
 #else
+#define tick_nohz_active (0)
 static inline void calc_load_nohz_start(void) { }
 static inline void calc_load_nohz_stop(void) { }
 #endif /* CONFIG_NO_HZ_COMMON */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2db1d0600a28..5f1e8362c97a 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5913,6 +5913,17 @@ static struct kmem_cache *task_group_cache __read_mostly;
 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
 
+static __init void rq_cputime_init(void)
+{
+	int cpu;
+	struct rq_cputime *rq_cputime;
+
+	for_each_possible_cpu(cpu) {
+		rq_cputime = &per_cpu(rq_cputimes, cpu);
+		raw_spin_lock_init(&rq_cputime->lock);
+	}
+}
+
 void __init sched_init(void)
 {
 	int i, j;
@@ -6074,6 +6085,9 @@ void __init sched_init(void)
 
 	init_schedstats();
 
+	if (use_sched_idle_time)
+		rq_cputime_init();
+
 	scheduler_running = 1;
 }
 
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index 0796f938c4f0..ded9d62b1fe4 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -567,6 +567,74 @@ static u64 scale_stime(u64 stime, u64 rtime, u64 total)
 	return scaled;
 }
 
+int use_sched_idle_time __read_mostly;
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rq_cputime, rq_cputimes);
+
+static int __init sched_idle_time_setup(char *str)
+{
+	use_sched_idle_time = 1;
+
+	return 0;
+}
+early_param("use-sched-idle-time", sched_idle_time_setup);
+
+int sched_idle_time_adjust(int cpu, u64 *utime, u64 *stime)
+{
+	struct rq_cputime *rq_cputime = &per_cpu(rq_cputimes, cpu);
+	struct cputime *prev = &rq_cputime->cpu_prev_time;
+	struct cputime *last = &rq_cputime->cpu_last_time;
+	u64 ut, st, delta, delta_ut, delta_st;
+
+	raw_spin_lock(&rq_cputime->lock);
+
+	delta = cpu_clock(cpu) - get_idle_time(cpu)
+		- (prev->utime + prev->stime);
+
+	ut = kcpustat_cpu(cpu).cpustat[CPUTIME_USER];
+	st = kcpustat_cpu(cpu).cpustat[CPUTIME_SYSTEM];
+
+	delta_ut = ut - last->utime;
+	delta_st = st - last->stime;
+
+	if (unlikely((s64)delta <= 0))
+		goto out;
+
+	if (delta_st == 0) {
+		prev->utime += delta;
+	} else if (delta_ut == 0) {
+		prev->stime += delta;
+	} else {
+		delta_st = scale_stime(delta_st, delta, delta_ut + delta_st);
+
+		if (unlikely(delta_st > delta))
+			delta_st = delta;
+
+		prev->stime += delta_st;
+		prev->utime += delta - delta_st;
+	}
+
+out:
+	last->utime = ut;
+	last->stime = st;
+
+	*utime = prev->utime;
+	*stime = prev->stime;
+
+	raw_spin_unlock(&rq_cputime->lock);
+
+	return 0;
+}
+
+unsigned long long sched_get_idle_time(int cpu)
+{
+	struct rq_cputime *rt = &per_cpu(rq_cputimes, cpu);
+
+	if (is_idle_task(curr_task(cpu)))
+		return rt->sum_idle_time + cpu_clock(cpu) - rt->last_entry_idle;
+	else
+		return rt->sum_idle_time;
+}
+
 /*
  * Adjust tick based cputime random precision against scheduler runtime
  * accounting.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b63172288f7b..bc001966315f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -919,6 +919,21 @@ struct rq {
 #endif
 };
 
+struct cputime {
+	u64 utime;
+	u64 stime;
+};
+
+struct rq_cputime {
+	raw_spinlock_t lock;
+	unsigned long long sum_idle_time;
+	unsigned long long last_entry_idle;
+	struct cputime cpu_prev_time;
+	struct cputime cpu_last_time;
+};
+
+DECLARE_PER_CPU(struct rq_cputime, rq_cputimes);
+
 static inline int cpu_of(struct rq *rq)
 {
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
index 8aea199a39b4..27b9980a0e9a 100644
--- a/kernel/sched/stats.h
+++ b/kernel/sched/stats.h
@@ -1,4 +1,5 @@
 /* SPDX-License-Identifier: GPL-2.0 */
+#include
 
 #ifdef CONFIG_SCHEDSTATS
 
@@ -153,6 +154,20 @@ __sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct
 static inline void
 sched_info_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next)
 {
+	if (use_sched_idle_time && !tick_nohz_active) {
+		struct rq *rq = task_rq(prev);
+		struct rq_cputime *rq_cputime = this_cpu_ptr(&rq_cputimes);
+		unsigned long long now = cpu_clock(cpu_of(rq)), delta = 0;
+
+		if (prev == rq->idle) {
+			delta = now - rq_cputime->last_entry_idle;
+			rq_cputime->sum_idle_time += delta;
+		}
+
+		if (next == rq->idle)
+			rq_cputime->last_entry_idle = now;
+	}
+
 	if (unlikely(sched_info_on()))
 		__sched_info_switch(rq, prev, next);
 }
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index e277284c2831..9ff03c4e7e92 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -149,14 +149,12 @@ static inline void tick_nohz_init(void) { }
 #endif
 
 #ifdef CONFIG_NO_HZ_COMMON
-extern unsigned long tick_nohz_active;
 extern void timers_update_nohz(void);
 # ifdef CONFIG_SMP
 extern struct static_key_false timers_migration_enabled;
 # endif
 #else /* CONFIG_NO_HZ_COMMON */
 static inline void timers_update_nohz(void) { }
-#define tick_nohz_active (0)
 #endif
 
 DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-- 
GitLab