Commit ab81d2d9 authored by Yihao Wu, committed by Caspar Zhang

alinux: sched: Add cpu_stress to show system-wide task waiting

to #28739709

/proc/loadavg can reflect the number of waiting tasks over a period
of time to some extent, but serving as an SLI requires better
precision and a quicker response. Furthermore, blocking on I/O is
not a concern here, and waiting caused by bandwidth control is
excluded from cpu_stress.

This patch adds a new interface, /proc/cpu_stress. It is based on
task runtime tracking, so there is no need to deal with complex
state transitions. And because runtime tracking is done in most
scheduler events, the precision is good enough.

Like loadavg, cpu_stress has three averaging windows (1, 5 and 15 min).
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Reviewed-by: Xunlei Pang <xlpang@linux.alibaba.com>
Parent 4d76552e
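Before the diff itself, a minimal userspace sketch of how a monitoring agent might consume the new file. This consumer is hypothetical and not part of the patch; it relies only on the three space-separated decimal averages that cpu_stress_proc_show() below prints.

	/* Hypothetical consumer of /proc/cpu_stress; not part of this patch. */
	#include <stdio.h>

	int main(void)
	{
		double stress1, stress5, stress15;
		FILE *fp = fopen("/proc/cpu_stress", "r");

		if (!fp) {
			perror("fopen /proc/cpu_stress");
			return 1;
		}
		/* Three decimal averages, like the three loadavg windows. */
		if (fscanf(fp, "%lf %lf %lf", &stress1, &stress5, &stress15) == 3)
			printf("cpu_stress 1/5/15min: %.2f %.2f %.2f\n",
			       stress1, stress5, stress15);
		fclose(fp);
		return 0;
	}

Roughly speaking, a value of 2.00 means that, averaged over the window, about two tasks were left waiting for CPU time system-wide.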
@@ -25,9 +25,40 @@ static int loadavg_proc_show(struct seq_file *m, void *v)
	return 0;
}

static inline void get_aven_stress(u64 *stress, u64 offset)
{
	stress[0] = stress_avg_total[0] + offset;
	stress[1] = stress_avg_total[1] + offset;
	stress[2] = stress_avg_total[2] + offset;
}

static int cpu_stress_proc_show(struct seq_file *m, void *v)
{
	u64 avn_stress[3];

	get_aven_stress(avn_stress, FIXED_1/200);

	seq_printf(m, "%llu.%02llu %llu.%02llu %llu.%02llu\n",
		LOAD_INT(avn_stress[0]), LOAD_FRAC(avn_stress[0]),
		LOAD_INT(avn_stress[1]), LOAD_FRAC(avn_stress[1]),
		LOAD_INT(avn_stress[2]), LOAD_FRAC(avn_stress[2]));

	return 0;
}

static int __init proc_loadavg_init(void)
{
	proc_create_single("loadavg", 0, NULL, loadavg_proc_show);
	return 0;
}
fs_initcall(proc_loadavg_init);

static int __init proc_cpu_stress_init(void)
{
	proc_create_single("cpu_stress", 0, NULL, cpu_stress_proc_show);
	/* sched_init is called earlier than init_timers so schedule it here */
	schedule_stress();
	return 0;
}
fs_initcall(proc_cpu_stress_init);
@@ -44,5 +44,7 @@ extern unsigned long calc_load_n(unsigned long load, unsigned long exp,
#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
extern void calc_global_load(unsigned long ticks);
extern u64 stress_avg_total[3];
extern void schedule_stress(void);
#endif /* _LINUX_SCHED_LOADAVG_H */
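The values are reported in the same fixed-point format as loadavg, using the LOAD_INT/LOAD_FRAC macros shown above. A standalone sketch of the display math follows; it assumes the stock kernel constants (FSHIFT = 11, so FIXED_1 = 2048) and shows why the FIXED_1/200 offset added by get_aven_stress() acts as a rounding bias of about 0.005 before two decimal places are printed.

	/* Standalone illustration of the fixed-point display math; assumes the
	 * stock kernel constants (FSHIFT = 11, FIXED_1 = 1 << FSHIFT = 2048).
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define FSHIFT		11
	#define FIXED_1		(1UL << FSHIFT)
	#define LOAD_INT(x)	((unsigned long)((x) >> FSHIFT))
	#define LOAD_FRAC(x)	LOAD_INT(((x) & (FIXED_1 - 1)) * 100)

	int main(void)
	{
		/* 0.37 "waiting tasks" in fixed point: 0.37 * 2048 ~= 758 */
		uint64_t stress = 758;

		/* FIXED_1/200 ~= 0.005, the rounding offset added by
		 * get_aven_stress() before printing two decimal places.
		 */
		stress += FIXED_1 / 200;

		printf("%lu.%02lu\n", LOAD_INT(stress), LOAD_FRAC(stress)); /* 0.37 */
		return 0;
	}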
@@ -818,6 +818,14 @@ update_exec_raw(struct cfs_rq *cfs_rq, struct sched_entity *curr)
	curr->exec_start_raw = now;
}

static inline void cpu_stress_update(struct rq *rq, u64 delta_exec)
{
	if (rq->nr_running > 1) {
		atomic64_add(delta_exec * (rq->nr_running - 1),
			     &rq->cpu_stress);
	}
}

/*
 * Update the current task's runtime statistics.
 */
@@ -851,6 +859,8 @@ static void update_curr(struct cfs_rq *cfs_rq)
		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
		cgroup_account_cputime(curtask, delta_exec);
		account_group_exec_runtime(curtask, delta_exec);
		cpu_stress_update(rq_of(cfs_rq), delta_exec);
	}

	account_cfs_rq_runtime(cfs_rq, delta_exec);
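The accounting rule above is the core of the patch: whenever update_curr() charges delta_exec nanoseconds to the running task, each of the other rq->nr_running - 1 runnable tasks on that runqueue has been waiting for exactly that long, so delta_exec * (nr_running - 1) is added to the per-CPU counter. A simplified userspace illustration follows; the harness is hypothetical, only the arithmetic mirrors the patch.

	/* Simplified, userspace illustration of the accounting rule in
	 * cpu_stress_update(); the surrounding harness is hypothetical.
	 */
	#include <stdio.h>
	#include <stdint.h>

	static uint64_t cpu_stress;	/* stands in for rq->cpu_stress */

	static void cpu_stress_update(unsigned int nr_running, uint64_t delta_exec)
	{
		/* While one task ran for delta_exec ns, the other nr_running - 1
		 * runnable tasks were each waiting for delta_exec ns.
		 */
		if (nr_running > 1)
			cpu_stress += delta_exec * (nr_running - 1);
	}

	int main(void)
	{
		/* 4 runnable tasks; the current one runs for 1 ms. */
		cpu_stress_update(4, 1000000ULL);
		printf("accumulated waiting: %llu ns\n",
		       (unsigned long long)cpu_stress);	/* 3000000 ns */
		return 0;
	}

Summed over time and over all CPUs, rq->cpu_stress therefore accumulates total task-waiting nanoseconds since the last sample.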
@@ -383,3 +383,68 @@ void calc_global_load_tick(struct rq *this_rq)
	this_rq->calc_load_update += LOAD_FREQ;
}

static void calc_stress_avgs_work(struct work_struct *work);

#define STRESS_FREQ (5*HZ+1)

u64 stress_avg_total[3];
static u64 stress_avg_next;
static u64 stress_avg_last;
static u64 stress_period __read_mostly;
DECLARE_DELAYED_WORK(stress_delayed_work, calc_stress_avgs_work);

static void calc_avgs(u64 avg[3], int missed_periods, u64 stress, u64 period)
{
	unsigned long pct;

	/* Fill in zeroes for periods of no activity */
	if (missed_periods) {
		avg[0] = calc_load_n(avg[0], EXP_1, 0, missed_periods);
		avg[1] = calc_load_n(avg[1], EXP_5, 0, missed_periods);
		avg[2] = calc_load_n(avg[2], EXP_15, 0, missed_periods);
	}

	/* Sample the most recent active period */
	if (period == 0)
		period = 1;
	pct = div64_u64(stress, period);
	pct *= FIXED_1;

	avg[0] = calc_load(avg[0], EXP_1, pct);
	avg[1] = calc_load(avg[1], EXP_5, pct);
	avg[2] = calc_load(avg[2], EXP_15, pct);
}

static void calc_stress_avgs_work(struct work_struct *work)
{
	int cpu;
	struct rq *rq;
	unsigned long delay_ticks;
	u64 now, stress, period, missed_periods = 0, stress_total = 0;

	now = sched_clock();
	if (now - stress_avg_next >= stress_period)
		missed_periods = div64_u64(now - stress_avg_next,
					   stress_period);

	period = now - (stress_avg_last + (missed_periods * stress_period));
	stress_avg_last = now;

	for_each_possible_cpu(cpu) {
		rq = cpu_rq(cpu);
		stress = atomic64_xchg(&rq->cpu_stress, 0);
		stress_total += stress;
	}

	calc_avgs(stress_avg_total, missed_periods, stress_total, period);

	stress_avg_next += (1 + missed_periods) * stress_period;
	delay_ticks = nsecs_to_jiffies(stress_avg_next - now) + 1;
	schedule_delayed_work(&stress_delayed_work, delay_ticks);
}

void schedule_stress(void)
{
	stress_period = jiffies_to_nsecs(STRESS_FREQ);
	schedule_delayed_work(&stress_delayed_work, STRESS_FREQ);
}
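calc_load() and calc_load_n() are the stock loadavg helpers: a fixed-point exponential moving average, with calc_load_n() folding in n missed periods at once. Conceptually, the sample fed to them here is the accumulated waiting time divided by the elapsed period, i.e. the average number of waiting tasks, expressed in FIXED_1 fixed point. A standalone sketch of one averaging step, assuming the stock constants (FSHIFT = 11, EXP_1 = 1884 for the 1-minute window with 5-second samples):

	/* One averaging step, mirroring calc_load() from the stock kernel
	 * loadavg code; constants are the usual kernel values and the input
	 * sample is made up for illustration.
	 */
	#include <stdio.h>
	#include <stdint.h>

	#define FSHIFT	11
	#define FIXED_1	(1UL << FSHIFT)
	#define EXP_1	1884	/* 1/exp(5s/1min) in fixed point */

	static uint64_t calc_load(uint64_t load, uint64_t exp, uint64_t active)
	{
		uint64_t newload = load * exp + active * (FIXED_1 - exp);

		if (active >= load)
			newload += FIXED_1 - 1;	/* round up when rising */
		return newload / FIXED_1;
	}

	int main(void)
	{
		/* Suppose the sampled stress works out to 0.5 waiting tasks
		 * on average, i.e. pct = 0.50 in fixed point.
		 */
		uint64_t pct = FIXED_1 / 2;
		uint64_t avg = 0;	/* previous 1-minute average */

		avg = calc_load(avg, EXP_1, pct);
		printf("new 1min avg: %lu.%02lu\n",
		       (unsigned long)(avg >> FSHIFT),
		       (unsigned long)(((avg & (FIXED_1 - 1)) * 100) >> FSHIFT));
		return 0;
	}

With a constant input the average converges toward that input; a single ~5-second sample of 0.50 lifts an idle 1-minute average by only about 0.04, which is why the three windows react at different speeds.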
@@ -885,6 +885,8 @@ struct rq {
	unsigned long calc_load_update;
	long calc_load_active;

	atomic64_t cpu_stress;

#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMP
	int hrtick_csd_pending;