Commit 76d98609 authored by Yihao Wu

alinux: sched: Introduce cfs scheduling latency histograms

to #28739709

Export wait_latency in "cpuacct.wait_latency", which indicates the
time that tasks in a cpuacct cgroup wait on a cfs_rq to be scheduled.

This is like "perf sched", but with lower overhead, so it can be
used as a constant monitor.

wait_latency is useful for debugging an application's high response
time (RT). It can tell whether the latency is caused by scheduling or
not. If it is, loadavg can tell whether the cause is bad scheduling
behaviour or system overload.

System admins can also use wait_latency to define an SLA. There are
various ways to decrease wait_latency so that the SLA is guaranteed.

This feature is disabled by default for performance reasons. It can
be switched on dynamically with "echo 1 > /proc/cpusli/sched_lat_enabled"
and off again with "echo 0".
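
For example, to switch collection on and read the setting back (a
sketch based on the proc handlers added below):

  $ echo 1 > /proc/cpusli/sched_lat_enabled
  $ cat /proc/cpusli/sched_lat_enabled
    1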

Example:

  $ cat /sys/fs/cgroup/cpuacct/a/cpuacct.wait_latency
    0-1ms:  4139
    1-4ms:  317
    4-7ms:  568
    7-10ms:         0
    10-100ms:       42324
    100-500ms:      9131
    500-1000ms:     95
    1000-5000ms:    134
    5000-10000ms:   0
    >=10000ms:      0
    total(ms):      4256455
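
Writing 0 to the file resets the histogram; any other value is
rejected with -EINVAL:

  $ echo 0 > /sys/fs/cgroup/cpuacct/a/cpuacct.wait_latency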
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Acked-by: Xunlei Pang <xlpang@linux.alibaba.com>
Reviewed-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Acked-by: Michael Wang <yun.wang@linux.alibaba.com>
Parent bcaf8afd
@@ -33,6 +33,72 @@ struct cpuacct_alistats {
} ____cacheline_aligned;
#endif
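/* Kinds of scheduling latency statistics; only CFS wait latency so far */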
enum sched_lat_stat_item {
SCHED_LAT_WAIT,
SCHED_LAT_NR_STAT
};
/*
 * Scheduler latency histogram distribution, in milliseconds:
 * [0, 1ms)
 * [1, 4ms)
 * [4, 7ms)
 * [7, 10ms)
 * [10, 100ms)
 * [100, 500ms)
 * [500, 1000ms)
 * [1000, 5000ms)
 * [5000, 10000ms)
 * [10000ms, INF)
 * total(ms)
 */
enum sched_lat_count_t {
SCHED_LAT_0_1,
SCHED_LAT_1_4,
SCHED_LAT_4_7,
SCHED_LAT_7_10,
SCHED_LAT_10_100,
SCHED_LAT_100_500,
SCHED_LAT_500_1000,
SCHED_LAT_1000_5000,
SCHED_LAT_5000_10000,
SCHED_LAT_10000_INF,
SCHED_LAT_TOTAL,
SCHED_LAT_NR_COUNT,
};
struct sched_cgroup_lat_stat_cpu {
unsigned long item[SCHED_LAT_NR_STAT][SCHED_LAT_NR_COUNT];
};
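/* Map a wait time, in milliseconds, to its histogram bucket */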
static inline enum sched_lat_count_t get_sched_lat_count_idx(u64 msecs)
{
enum sched_lat_count_t idx;
if (msecs < 1)
idx = SCHED_LAT_0_1;
else if (msecs < 4)
idx = SCHED_LAT_1_4;
else if (msecs < 7)
idx = SCHED_LAT_4_7;
else if (msecs < 10)
idx = SCHED_LAT_7_10;
else if (msecs < 100)
idx = SCHED_LAT_10_100;
else if (msecs < 500)
idx = SCHED_LAT_100_500;
else if (msecs < 1000)
idx = SCHED_LAT_500_1000;
else if (msecs < 5000)
idx = SCHED_LAT_1000_5000;
else if (msecs < 10000)
idx = SCHED_LAT_5000_10000;
else
idx = SCHED_LAT_10000_INF;
return idx;
}
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
@@ -40,6 +106,7 @@ struct cpuacct {
struct cpuacct_usage __percpu *cpuusage;
#ifdef CONFIG_SCHED_SLI
struct cpuacct_alistats __percpu *alistats;
struct sched_cgroup_lat_stat_cpu __percpu *lat_stat_cpu;
#endif
struct kernel_cpustat __percpu *cpustat;
@@ -68,16 +135,90 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
#ifdef CONFIG_SCHED_SLI
static DEFINE_PER_CPU(struct cpuacct_alistats, root_alistats);
static DEFINE_PER_CPU(struct sched_cgroup_lat_stat_cpu, root_lat_stat_cpu);
#endif
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
#ifdef CONFIG_SCHED_SLI
.alistats = &root_alistats,
.lat_stat_cpu = &root_lat_stat_cpu,
#endif
};
#ifdef CONFIG_SCHED_SLI
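/* Accounting is off by default; the static key keeps the disabled path cheap */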
static DEFINE_STATIC_KEY_TRUE(cpuacct_no_sched_lat);
static int cpuacct_sched_lat_enabled_show(struct seq_file *m, void *v)
{
seq_printf(m, "%d\n", !static_key_enabled(&cpuacct_no_sched_lat));
return 0;
}
static int cpuacct_sched_lat_enabled_open(struct inode *inode,
struct file *file)
{
return single_open(file, cpuacct_sched_lat_enabled_show, NULL);
}
static ssize_t cpuacct_sched_lat_enabled_write(struct file *file,
const char __user *ubuf,
size_t count, loff_t *ppos)
{
char val = -1;
int ret = count;
if (count < 1 || *ppos) {
ret = -EINVAL;
goto out;
}
if (copy_from_user(&val, ubuf, 1)) {
ret = -EFAULT;
goto out;
}
switch (val) {
case '0':
static_branch_enable(&cpuacct_no_sched_lat);
break;
case '1':
static_branch_disable(&cpuacct_no_sched_lat);
break;
default:
ret = -EINVAL;
}
out:
return ret;
}
static const struct file_operations cpuacct_sched_lat_enabled_fops = {
.open = cpuacct_sched_lat_enabled_open,
.read = seq_read,
.write = cpuacct_sched_lat_enabled_write,
.llseek = seq_lseek,
.release = single_release,
};
static int __init init_cpuacct_sched_lat_enabled(void)
{
struct proc_dir_entry *ca_dir, *sched_lat_enabled_file;
ca_dir = proc_mkdir("cpusli", NULL);
if (!ca_dir)
return -ENOMEM;
sched_lat_enabled_file = proc_create("sched_lat_enabled", 0600,
ca_dir, &cpuacct_sched_lat_enabled_fops);
if (!sched_lat_enabled_file) {
remove_proc_entry("cpusli", NULL);
return -ENOMEM;
}
return 0;
}
__initcall(init_cpuacct_sched_lat_enabled);
void task_ca_increase_nr_migrations(struct task_struct *tsk)
{
struct cpuacct *ca;
@@ -87,6 +228,25 @@ void task_ca_increase_nr_migrations(struct task_struct *tsk)
this_cpu_ptr(ca->alistats)->nr_migrations++;
rcu_read_unlock();
}
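/* Account one wait-latency sample, delta in nanoseconds, to tsk's cgroup */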
void cpuacct_update_latency(struct task_struct *tsk, u64 delta)
{
enum sched_lat_count_t idx;
struct cpuacct *ca;
unsigned int msecs;
if (static_branch_likely(&cpuacct_no_sched_lat))
return;
rcu_read_lock();
ca = task_ca(tsk);
msecs = delta >> 20; /* approximate ns-to-ms (>>20 ~= /10^6), avoids a u64 division */
idx = get_sched_lat_count_idx(msecs);
this_cpu_inc(ca->lat_stat_cpu->item[SCHED_LAT_WAIT][idx]);
this_cpu_add(ca->lat_stat_cpu->item[SCHED_LAT_WAIT][SCHED_LAT_TOTAL],
delta);
rcu_read_unlock();
}
#endif
/* Create a new CPU accounting group */
@@ -115,6 +275,10 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
ca->alistats = alloc_percpu(struct cpuacct_alistats);
if (!ca->alistats)
goto out_free_cpustat;
ca->lat_stat_cpu = alloc_percpu(struct sched_cgroup_lat_stat_cpu);
if (!ca->lat_stat_cpu)
goto out_free_alistats;
#endif
for_each_possible_cpu(i) {
@@ -125,6 +289,8 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
return &ca->css;
#ifdef CONFIG_SCHED_SLI
out_free_alistats:
free_percpu(ca->alistats);
out_free_cpustat:
free_percpu(ca->cpustat);
#endif
@@ -145,6 +311,7 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
free_percpu(ca->cpuusage);
#ifdef CONFIG_SCHED_SLI
free_percpu(ca->alistats);
free_percpu(ca->lat_stat_cpu);
#endif
kfree(ca);
}
@@ -601,6 +768,83 @@ static int cpuacct_proc_stats_show(struct seq_file *sf, void *v)
return 0;
}
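/* Generate a helper that zeroes the calling CPU's buckets for one stat item */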
#define SCHED_LAT_STAT_SMP_WRITE(name, sidx) \
static void smp_write_##name(void *info) \
{ \
struct cpuacct *ca = (struct cpuacct *)info; \
int i; \
\
for (i = SCHED_LAT_0_1; i < SCHED_LAT_NR_COUNT; i++) \
this_cpu_write(ca->lat_stat_cpu->item[sidx][i], 0); \
} \
SCHED_LAT_STAT_SMP_WRITE(sched_wait_latency, SCHED_LAT_WAIT);
smp_call_func_t smp_sched_lat_write_funcs[] = {
smp_write_sched_wait_latency
};
static int sched_lat_stat_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct cpuacct *ca = css_ca(css);
enum sched_lat_stat_item idx = cft->private;
smp_call_func_t func = smp_sched_lat_write_funcs[idx];
if (val != 0)
return -EINVAL;
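/* Zero this CPU's buckets directly, then every other CPU's via IPI */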
func((void *)ca);
smp_call_function(func, (void *)ca, 1);
return 0;
}
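/* Sum one histogram bucket across all possible CPUs */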
static u64 sched_lat_stat_gather(struct cpuacct *ca,
enum sched_lat_stat_item sidx,
enum sched_lat_count_t cidx)
{
u64 sum = 0;
int cpu;
for_each_possible_cpu(cpu)
sum += per_cpu_ptr(ca->lat_stat_cpu, cpu)->item[sidx][cidx];
return sum;
}
static int sched_lat_stat_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
enum sched_lat_stat_item s = seq_cft(sf)->private;
/* CFS scheduling latency histograms of this cgroup */
seq_printf(sf, "0-1ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_0_1));
seq_printf(sf, "1-4ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_1_4));
seq_printf(sf, "4-7ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_4_7));
seq_printf(sf, "7-10ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_7_10));
seq_printf(sf, "10-100ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_10_100));
seq_printf(sf, "100-500ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_100_500));
seq_printf(sf, "500-1000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_500_1000));
seq_printf(sf, "1000-5000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_1000_5000));
seq_printf(sf, "5000-10000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_5000_10000));
seq_printf(sf, ">=10000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_10000_INF));
seq_printf(sf, "total(ms): \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_TOTAL) / 1000000);
return 0;
}
#endif
static struct cftype files[] = {
@@ -642,6 +886,12 @@ static struct cftype files[] = {
.name = "proc_stat",
.seq_show = cpuacct_proc_stats_show,
},
{
.name = "wait_latency",
.private = SCHED_LAT_WAIT,
.write_u64 = sched_lat_stat_write,
.seq_show = sched_lat_stat_show
},
#endif
{ } /* terminate */
};
...
@@ -890,6 +890,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
return;
}
trace_sched_stat_wait(p, delta);
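/* delta: time this task spent waiting on the cfs_rq, in nanoseconds */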
cpuacct_update_latency(p, delta);
} }
__schedstat_set(se->statistics.wait_max,
...
@@ -2282,8 +2282,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
extern u64 get_idle_time(int cpu);
extern u64 get_iowait_time(int cpu);
extern void task_ca_increase_nr_migrations(struct task_struct *tsk);
void cpuacct_update_latency(struct task_struct *tsk, u64 delta);
#else
static inline void task_ca_increase_nr_migrations(struct task_struct *tsk) { }
static inline void cpuacct_update_latency(struct task_struct *tsk,
u64 delta) { }
#endif
#ifdef CONFIG_PSI
...