Commit 76d98609 authored by Yihao Wu

alinux: sched: Introduce cfs scheduling latency histograms

to #28739709

Export wait_latency in "cpuacct.wait_latency", which indicates the
time that tasks in a cpuacct cgroup wait on a cfs_rq to be scheduled.

This is like "perf sched", but with lower overhead, so it can be
used as a constant monitor.

wait_latency is useful for debugging an application's high response
time (RT). It can tell whether the latency is caused by scheduling or
not. If it is, loadavg can tell whether the cause is bad scheduling
behaviour or system overload.

System admins can also use wait_latency to define an SLA. There are
various ways to decrease wait_latency so that the SLA is guaranteed.

This feature is disabled by default for performance reasons. It can
be switched on dynamically with "echo 1 > /proc/cpusli/sched_lat_enabled"
and off again with "echo 0".
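
For example, to switch collection on and read the setting back (a
sketch based on the proc handlers added below):

  $ echo 1 > /proc/cpusli/sched_lat_enabled
  $ cat /proc/cpusli/sched_lat_enabled
    1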

Example:

  $ cat /sys/fs/cgroup/cpuacct/a/cpuacct.wait_latency
    0-1ms:  4139
    1-4ms:  317
    4-7ms:  568
    7-10ms:         0
    10-100ms:       42324
    100-500ms:      9131
    500-1000ms:     95
    1000-5000ms:    134
    5000-10000ms:   0
    >=10000ms:      0
    total(ms):      4256455
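
Writing 0 to the file resets the histogram; any other value is
rejected with -EINVAL:

  $ echo 0 > /sys/fs/cgroup/cpuacct/a/cpuacct.wait_latency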
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Acked-by: Xunlei Pang <xlpang@linux.alibaba.com>
Reviewed-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Acked-by: Michael Wang <yun.wang@linux.alibaba.com>
Parent bcaf8afd
@@ -33,6 +33,72 @@ struct cpuacct_alistats {
} ____cacheline_aligned;
#endif
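/* Kinds of scheduling latency statistics; only CFS wait latency so far */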
enum sched_lat_stat_item {
SCHED_LAT_WAIT,
SCHED_LAT_NR_STAT
};
/*
 * Scheduler latency histogram distribution, in milliseconds:
 * [0, 1ms)
 * [1, 4ms)
 * [4, 7ms)
 * [7, 10ms)
 * [10, 100ms)
 * [100, 500ms)
 * [500, 1000ms)
 * [1000, 5000ms)
 * [5000, 10000ms)
 * [10000ms, INF)
 * total(ms)
 */
enum sched_lat_count_t {
SCHED_LAT_0_1,
SCHED_LAT_1_4,
SCHED_LAT_4_7,
SCHED_LAT_7_10,
SCHED_LAT_10_100,
SCHED_LAT_100_500,
SCHED_LAT_500_1000,
SCHED_LAT_1000_5000,
SCHED_LAT_5000_10000,
SCHED_LAT_10000_INF,
SCHED_LAT_TOTAL,
SCHED_LAT_NR_COUNT,
};
struct sched_cgroup_lat_stat_cpu {
unsigned long item[SCHED_LAT_NR_STAT][SCHED_LAT_NR_COUNT];
};
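/* Map a wait time, in milliseconds, to its histogram bucket */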
static inline enum sched_lat_count_t get_sched_lat_count_idx(u64 msecs)
{
enum sched_lat_count_t idx;
if (msecs < 1)
idx = SCHED_LAT_0_1;
else if (msecs < 4)
idx = SCHED_LAT_1_4;
else if (msecs < 7)
idx = SCHED_LAT_4_7;
else if (msecs < 10)
idx = SCHED_LAT_7_10;
else if (msecs < 100)
idx = SCHED_LAT_10_100;
else if (msecs < 500)
idx = SCHED_LAT_100_500;
else if (msecs < 1000)
idx = SCHED_LAT_500_1000;
else if (msecs < 5000)
idx = SCHED_LAT_1000_5000;
else if (msecs < 10000)
idx = SCHED_LAT_5000_10000;
else
idx = SCHED_LAT_10000_INF;
return idx;
}
/* track CPU usage of a group of tasks and its child groups */
struct cpuacct {
struct cgroup_subsys_state css;
@@ -40,6 +106,7 @@ struct cpuacct {
struct cpuacct_usage __percpu *cpuusage;
#ifdef CONFIG_SCHED_SLI
struct cpuacct_alistats __percpu *alistats;
struct sched_cgroup_lat_stat_cpu __percpu *lat_stat_cpu;
#endif
struct kernel_cpustat __percpu *cpustat;
@@ -68,16 +135,90 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca)
static DEFINE_PER_CPU(struct cpuacct_usage, root_cpuacct_cpuusage);
#ifdef CONFIG_SCHED_SLI
static DEFINE_PER_CPU(struct cpuacct_alistats, root_alistats);
static DEFINE_PER_CPU(struct sched_cgroup_lat_stat_cpu, root_lat_stat_cpu);
#endif
static struct cpuacct root_cpuacct = {
.cpustat = &kernel_cpustat,
.cpuusage = &root_cpuacct_cpuusage,
#ifdef CONFIG_SCHED_SLI
.alistats = &root_alistats,
.lat_stat_cpu = &root_lat_stat_cpu,
#endif
};
#ifdef CONFIG_SCHED_SLI
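/* Accounting is off by default; the static key keeps the disabled path cheap */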
static DEFINE_STATIC_KEY_TRUE(cpuacct_no_sched_lat);
static int cpuacct_sched_lat_enabled_show(struct seq_file *m, void *v)
{
seq_printf(m, "%d\n", !static_key_enabled(&cpuacct_no_sched_lat));
return 0;
}
static int cpuacct_sched_lat_enabled_open(struct inode *inode,
struct file *file)
{
return single_open(file, cpuacct_sched_lat_enabled_show, NULL);
}
static ssize_t cpuacct_sched_lat_enabled_write(struct file *file,
const char __user *ubuf,
size_t count, loff_t *ppos)
{
char val = -1;
int ret = count;
if (count < 1 || *ppos) {
ret = -EINVAL;
goto out;
}
if (copy_from_user(&val, ubuf, 1)) {
ret = -EFAULT;
goto out;
}
switch (val) {
case '0':
static_branch_enable(&cpuacct_no_sched_lat);
break;
case '1':
static_branch_disable(&cpuacct_no_sched_lat);
break;
default:
ret = -EINVAL;
}
out:
return ret;
}
static const struct file_operations cpuacct_sched_lat_enabled_fops = {
.open = cpuacct_sched_lat_enabled_open,
.read = seq_read,
.write = cpuacct_sched_lat_enabled_write,
.llseek = seq_lseek,
.release = single_release,
};
static int __init init_cpuacct_sched_lat_enabled(void)
{
struct proc_dir_entry *ca_dir, *sched_lat_enabled_file;
ca_dir = proc_mkdir("cpusli", NULL);
if (!ca_dir)
return -ENOMEM;
sched_lat_enabled_file = proc_create("sched_lat_enabled", 0600,
ca_dir, &cpuacct_sched_lat_enabled_fops);
if (!sched_lat_enabled_file) {
remove_proc_entry("cpusli", NULL);
return -ENOMEM;
}
return 0;
}
__initcall(init_cpuacct_sched_lat_enabled);
void task_ca_increase_nr_migrations(struct task_struct *tsk)
{
struct cpuacct *ca;
@@ -87,6 +228,25 @@ void task_ca_increase_nr_migrations(struct task_struct *tsk)
this_cpu_ptr(ca->alistats)->nr_migrations++;
rcu_read_unlock();
}
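/* Account one wait-latency sample, delta in nanoseconds, to tsk's cgroup */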
void cpuacct_update_latency(struct task_struct *tsk, u64 delta)
{
enum sched_lat_count_t idx;
struct cpuacct *ca;
unsigned int msecs;
if (static_branch_likely(&cpuacct_no_sched_lat))
return;
rcu_read_lock();
ca = task_ca(tsk);
msecs = delta >> 20; /* approximate ns-to-ms (>>20 ~= /10^6), avoids a u64 division */
idx = get_sched_lat_count_idx(msecs);
this_cpu_inc(ca->lat_stat_cpu->item[SCHED_LAT_WAIT][idx]);
this_cpu_add(ca->lat_stat_cpu->item[SCHED_LAT_WAIT][SCHED_LAT_TOTAL],
delta);
rcu_read_unlock();
}
#endif
/* Create a new CPU accounting group */
@@ -115,6 +275,10 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
ca->alistats = alloc_percpu(struct cpuacct_alistats);
if (!ca->alistats)
goto out_free_cpustat;
ca->lat_stat_cpu = alloc_percpu(struct sched_cgroup_lat_stat_cpu);
if (!ca->lat_stat_cpu)
goto out_free_alistats;
#endif
for_each_possible_cpu(i) {
@@ -125,6 +289,8 @@ cpuacct_css_alloc(struct cgroup_subsys_state *parent_css)
return &ca->css;
#ifdef CONFIG_SCHED_SLI
out_free_alistats:
free_percpu(ca->alistats);
out_free_cpustat:
free_percpu(ca->cpustat);
#endif
@@ -145,6 +311,7 @@ static void cpuacct_css_free(struct cgroup_subsys_state *css)
free_percpu(ca->cpuusage);
#ifdef CONFIG_SCHED_SLI
free_percpu(ca->alistats);
free_percpu(ca->lat_stat_cpu);
#endif
kfree(ca);
}
@@ -601,6 +768,83 @@ static int cpuacct_proc_stats_show(struct seq_file *sf, void *v)
return 0;
}
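/* Generate a helper that zeroes the calling CPU's buckets for one stat item */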
#define SCHED_LAT_STAT_SMP_WRITE(name, sidx) \
static void smp_write_##name(void *info) \
{ \
struct cpuacct *ca = (struct cpuacct *)info; \
int i; \
\
for (i = SCHED_LAT_0_1; i < SCHED_LAT_NR_COUNT; i++) \
this_cpu_write(ca->lat_stat_cpu->item[sidx][i], 0); \
} \
SCHED_LAT_STAT_SMP_WRITE(sched_wait_latency, SCHED_LAT_WAIT);
smp_call_func_t smp_sched_lat_write_funcs[] = {
smp_write_sched_wait_latency
};
static int sched_lat_stat_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
struct cpuacct *ca = css_ca(css);
enum sched_lat_stat_item idx = cft->private;
smp_call_func_t func = smp_sched_lat_write_funcs[idx];
if (val != 0)
return -EINVAL;
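/* Zero this CPU's buckets directly, then every other CPU's via IPI */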
func((void *)ca);
smp_call_function(func, (void *)ca, 1);
return 0;
}
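/* Sum one histogram bucket across all possible CPUs */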
static u64 sched_lat_stat_gather(struct cpuacct *ca,
enum sched_lat_stat_item sidx,
enum sched_lat_count_t cidx)
{
u64 sum = 0;
int cpu;
for_each_possible_cpu(cpu)
sum += per_cpu_ptr(ca->lat_stat_cpu, cpu)->item[sidx][cidx];
return sum;
}
static int sched_lat_stat_show(struct seq_file *sf, void *v)
{
struct cpuacct *ca = css_ca(seq_css(sf));
enum sched_lat_stat_item s = seq_cft(sf)->private;
/* CFS scheduling latency histograms of this cgroup */
seq_printf(sf, "0-1ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_0_1));
seq_printf(sf, "1-4ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_1_4));
seq_printf(sf, "4-7ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_4_7));
seq_printf(sf, "7-10ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_7_10));
seq_printf(sf, "10-100ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_10_100));
seq_printf(sf, "100-500ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_100_500));
seq_printf(sf, "500-1000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_500_1000));
seq_printf(sf, "1000-5000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_1000_5000));
seq_printf(sf, "5000-10000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_5000_10000));
seq_printf(sf, ">=10000ms: \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_10000_INF));
seq_printf(sf, "total(ms): \t%llu\n",
sched_lat_stat_gather(ca, s, SCHED_LAT_TOTAL) / 1000000);
return 0;
}
#endif
static struct cftype files[] = {
@@ -642,6 +886,12 @@ static struct cftype files[] = {
.name = "proc_stat",
.seq_show = cpuacct_proc_stats_show,
},
{
.name = "wait_latency",
.private = SCHED_LAT_WAIT,
.write_u64 = sched_lat_stat_write,
.seq_show = sched_lat_stat_show
},
#endif
{ } /* terminate */
};
...
@@ -890,6 +890,7 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
return;
}
trace_sched_stat_wait(p, delta);
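/* delta: time this task spent waiting on the cfs_rq, in nanoseconds */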
cpuacct_update_latency(p, delta);
} }
__schedstat_set(se->statistics.wait_max,
...
@@ -2282,8 +2282,11 @@ unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned
extern u64 get_idle_time(int cpu);
extern u64 get_iowait_time(int cpu);
extern void task_ca_increase_nr_migrations(struct task_struct *tsk);
void cpuacct_update_latency(struct task_struct *tsk, u64 delta);
#else
static inline void task_ca_increase_nr_migrations(struct task_struct *tsk) { }
static inline void cpuacct_update_latency(struct task_struct *tsk,
u64 delta) { }
#endif
#ifdef CONFIG_PSI
...