From ab81d2d9f7450c048c4f8eab118e7b155e0536b5 Mon Sep 17 00:00:00 2001
From: Yihao Wu
Date: Mon, 1 Jun 2020 11:12:44 +0800
Subject: [PATCH] alinux: sched: Add cpu_stress to show system-wide task
 waiting

to #28739709

/proc/loadavg can reflect the waiting tasks over a period of time to some
extent, but serving as an SLI requires better precision and a quicker
response. Furthermore, I/O blocking is not a concern here, and bandwidth
control is excluded from cpu_stress.

This patch adds a new interface, /proc/cpu_stress. It is based on task
runtime tracking, so we do not need to deal with complex state
transitions. And because task runtime tracking is done in most scheduler
events, the precision is good enough.

Like loadavg, cpu_stress has three averaging windows (1, 5, and 15 min).

Signed-off-by: Yihao Wu
Reviewed-by: Xunlei Pang
---
 fs/proc/loadavg.c             | 31 +++++++++++++++++
 include/linux/sched/loadavg.h |  2 ++
 kernel/sched/fair.c           | 10 ++++++
 kernel/sched/loadavg.c        | 65 +++++++++++++++++++++++++++++++++++
 kernel/sched/sched.h          |  2 ++
 5 files changed, 110 insertions(+)

diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c
index 8468baee951d..a14813f226d6 100644
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -25,9 +25,40 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
 	return 0;
 }
 
+static inline void get_aven_stress(u64 *stress, u64 offset)
+{
+	stress[0] = stress_avg_total[0] + offset;
+	stress[1] = stress_avg_total[1] + offset;
+	stress[2] = stress_avg_total[2] + offset;
+}
+
+static int cpu_stress_proc_show(struct seq_file *m, void *v)
+{
+	u64 avn_stress[3];
+
+	get_aven_stress(avn_stress, FIXED_1/200);
+
+	seq_printf(m, "%llu.%02llu %llu.%02llu %llu.%02llu\n",
+		LOAD_INT(avn_stress[0]), LOAD_FRAC(avn_stress[0]),
+		LOAD_INT(avn_stress[1]), LOAD_FRAC(avn_stress[1]),
+		LOAD_INT(avn_stress[2]), LOAD_FRAC(avn_stress[2]));
+
+	return 0;
+}
+
 static int __init proc_loadavg_init(void)
 {
 	proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
 	return 0;
 }
 fs_initcall(proc_loadavg_init);
+
+static int __init proc_cpu_stress_init(void)
+{
+	proc_create_single("cpu_stress", 0, NULL, cpu_stress_proc_show);
+
+	/* sched_init is called earlier than init_timers so schedule it here */
+	schedule_stress();
+	return 0;
+}
+fs_initcall(proc_cpu_stress_init);
diff --git a/include/linux/sched/loadavg.h b/include/linux/sched/loadavg.h
index 4859bea47a7b..27a3ea0dd465 100644
--- a/include/linux/sched/loadavg.h
+++ b/include/linux/sched/loadavg.h
@@ -44,5 +44,7 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 
 extern void calc_global_load(unsigned long ticks);
+extern u64 stress_avg_total[3];
+extern void schedule_stress(void);
 
 #endif /* _LINUX_SCHED_LOADAVG_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b5fd5755ac73..44b549ca837d 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -818,6 +818,14 @@ update_exec_raw(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 	curr->exec_start_raw = now;
 }
 
+static inline void cpu_stress_update(struct rq *rq, u64 delta_exec)
+{
+	if (rq->nr_running > 1) {
+		atomic64_add(delta_exec * (rq->nr_running - 1),
+			     &rq->cpu_stress);
+	}
+}
+
 /*
  * Update the current task's runtime statistics.
  */
@@ -851,6 +859,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
 		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 		cgroup_account_cputime(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
+
+		cpu_stress_update(rq_of(cfs_rq), delta_exec);
 	}
 
 	account_cfs_rq_runtime(cfs_rq, delta_exec);
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
index 28a516575c18..2d1f90697754 100644
--- a/kernel/sched/loadavg.c
+++ b/kernel/sched/loadavg.c
@@ -383,3 +383,68 @@ void calc_global_load_tick(struct rq *this_rq)
 
 	this_rq->calc_load_update += LOAD_FREQ;
 }
+
+static void calc_stress_avgs_work(struct work_struct *work);
+#define STRESS_FREQ (5*HZ+1)
+u64 stress_avg_total[3];
+static u64 stress_avg_next;
+static u64 stress_avg_last;
+static u64 stress_period __read_mostly;
+DECLARE_DELAYED_WORK(stress_delayed_work, calc_stress_avgs_work);
+
+static void calc_avgs(u64 avg[3], int missed_periods, u64 stress, u64 period)
+{
+	unsigned long pct;
+
+	/* Fill in zeroes for periods of no activity */
+	if (missed_periods) {
+		avg[0] = calc_load_n(avg[0], EXP_1, 0, missed_periods);
+		avg[1] = calc_load_n(avg[1], EXP_5, 0, missed_periods);
+		avg[2] = calc_load_n(avg[2], EXP_15, 0, missed_periods);
+	}
+
+	/* Sample the most recent active period */
+	if (period == 0)
+		period = 1;
+	pct = div64_u64(stress, period);
+	pct *= FIXED_1;
+	avg[0] = calc_load(avg[0], EXP_1, pct);
+	avg[1] = calc_load(avg[1], EXP_5, pct);
+	avg[2] = calc_load(avg[2], EXP_15, pct);
+}
+
+static void calc_stress_avgs_work(struct work_struct *work)
+{
+	int cpu;
+	struct rq *rq;
+	unsigned long delay_ticks;
+	u64 now, stress, period, missed_periods = 0, stress_total = 0;
+
+	now = sched_clock();
+
+	if (now - stress_avg_next >= stress_period)
+		missed_periods = div64_u64(now - stress_avg_next,
+					   stress_period);
+
+	period = now - (stress_avg_last + (missed_periods * stress_period));
+	stress_avg_last = now;
+
+	for_each_possible_cpu(cpu) {
+		rq = cpu_rq(cpu);
+
+		stress = atomic64_xchg(&rq->cpu_stress, 0);
+		stress_total += stress;
+	}
+
+	calc_avgs(stress_avg_total, missed_periods, stress_total, period);
+
+	stress_avg_next += (1 + missed_periods) * stress_period;
+	delay_ticks = nsecs_to_jiffies(stress_avg_next - now) + 1;
+	schedule_delayed_work(&stress_delayed_work, delay_ticks);
+}
+
+void schedule_stress(void)
+{
+	stress_period = jiffies_to_nsecs(STRESS_FREQ);
+	schedule_delayed_work(&stress_delayed_work, STRESS_FREQ);
+}
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 83a2962822f6..228ed28e6236 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -885,6 +885,8 @@ struct rq {
 	unsigned long		calc_load_update;
 	long			calc_load_active;
 
+	atomic64_t		cpu_stress;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
 	int			hrtick_csd_pending;
-- 
GitLab
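
As a rough illustration (not part of the patch), the new /proc/cpu_stress
file can be read from user space just like /proc/loadavg: cpu_stress_proc_show()
above prints the 1-, 5- and 15-minute averages as three fixed-point values.
Below is a minimal sketch of such a reader, assuming the patch is applied;
it is purely illustrative.

/*
 * Minimal sketch of a user-space reader for /proc/cpu_stress, assuming
 * the patch above is applied. The file holds three fields ("x.yy x.yy x.yy"),
 * the 1-, 5- and 15-minute stress averages, in the same style as
 * /proc/loadavg.
 */
#include <stdio.h>

int main(void)
{
	double avg1, avg5, avg15;
	FILE *fp = fopen("/proc/cpu_stress", "r");

	if (!fp) {
		perror("fopen /proc/cpu_stress");
		return 1;
	}
	if (fscanf(fp, "%lf %lf %lf", &avg1, &avg5, &avg15) == 3)
		printf("cpu_stress: 1min=%.2f 5min=%.2f 15min=%.2f\n",
		       avg1, avg5, avg15);
	fclose(fp);
	return 0;
}

A plain `cat /proc/cpu_stress` shows the same three fields; the program
above merely parses them into doubles.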