Commit 61e58859 authored by Yihao Wu

alinux: sched: Introduce per-cgroup idle accounting

to #26424323

Since idle is what we care about, let's take idle as the central state and
omit transitions between the other states. Below is the state transition
graph:

                                sleep->deque
+-----------+ cpumask +-------+ exit->deque +-------+
|ineffective|-------- | idle  | <-----------|running|
+-----------+         +-------+             +-------+
                        ^ |
 unthrtl child -> deque | |
          wake -> deque | |thrtl child -> enque
       migrate -> deque | |migrate -> enque
                        | v
                      +-------+
                      | steal |
                      +-------+

We can summarize the idle state condition as:

!se->on_rq && !my_q->throttled && cpu allowed.

From this graph and condition, we can hook (de|en)queue_task_fair,
update_cpumasks_hier and (un)throttle_cfs_rq to account the idle state.

In the hooked functions, we also check these conditions to avoid
accounting unwanted CPU clocks.
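
For reference, a minimal sketch of that condition as a C predicate. The
helper name and the effective_cpus argument are illustrative only and are
not part of this patch; in the patch itself the cpumask part is driven
from cpuset's update_cpumasks_hier():

	/*
	 * Sketch only: a group se counts as cgroup-idle when it is dequeued,
	 * its own cfs_rq (se->my_q) is not throttled, and this CPU is still
	 * in the cgroup's effective cpumask.
	 */
	static inline bool se_cgroup_idle(struct sched_entity *se, int cpu,
					  const struct cpumask *effective_cpus)
	{
		return !se->on_rq &&
		       !cfs_rq_throttled(group_cfs_rq(se)) &&
		       cpumask_test_cpu(cpu, effective_cpus);
	}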
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Signed-off-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Acked-by: Michael Wang <yun.wang@linux.alibaba.com>
Parent 965d75d3
@@ -902,4 +902,16 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 		free_cgroup_ns(ns);
 }
 
+#ifdef CONFIG_SCHED_SLI
+void cpuacct_cpuset_changed(struct cgroup *cgrp,
+		struct cpumask *effective, struct cpumask *new_added);
+void cgroup_idle_end(struct sched_entity *se);
+void cgroup_idle_start(struct sched_entity *se);
+#else
+static inline void cpuacct_cpuset_changed(struct cgroup *cgrp,
+		struct cpumask *effective, struct cpumask *new_added) { }
+static inline void cgroup_idle_end(struct sched_entity *se) { }
+static inline void cgroup_idle_start(struct sched_entity *se) { }
+#endif
+
 #endif /* _LINUX_CGROUP_H */
@@ -462,6 +462,10 @@ struct sched_entity {
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
+	u64			cg_idle_start;
+	u64			cg_idle_sum;
+	seqcount_t		idle_seqcount;
+
 	u64			nr_migrations;
 
 	struct sched_statistics	statistics;
......
@@ -896,6 +896,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
 	struct cpuset *cp;
+	struct cpumask added, deleted, old_cpus;
 	struct cgroup_subsys_state *pos_css;
 	bool need_rebuild_sched_domains = false;
 
@@ -912,6 +913,11 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		if (is_in_v2_mode() && cpumask_empty(new_cpus))
 			cpumask_copy(new_cpus, parent->effective_cpus);
 
+		if (cpumask_empty(cp->effective_cpus))
+			cpumask_copy(&old_cpus, parent->effective_cpus);
+		else
+			cpumask_copy(&old_cpus, cp->effective_cpus);
+
 		/* Skip the whole subtree if the cpumask remains the same. */
 		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
 			pos_css = css_rightmost_descendant(pos_css);
@@ -929,8 +935,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
+		/* add = new - old = new & (~old) */
+		cpumask_andnot(&added, new_cpus, &old_cpus);
+		cpuacct_cpuset_changed(cs->css.cgroup, NULL, &added);
+
 		update_tasks_cpumask(cp);
 
+		/* deleted = old - new = old & (~new) */
+		cpumask_andnot(&deleted, &old_cpus, new_cpus);
+		cpuacct_cpuset_changed(cs->css.cgroup, &deleted, NULL);
+
 		/*
 		 * If the effective cpumask of any non-empty cpuset is changed,
 		 * we need to rebuild sched domains.
@@ -2026,6 +2040,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+	cpuacct_cpuset_changed(cs->css.cgroup, NULL, cs->effective_cpus);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
......
@@ -445,6 +445,74 @@ static unsigned long ca_uninterruptible(struct cpuacct *ca, int cpu)
 	return nr;
 }
 
+void cgroup_idle_start(struct sched_entity *se)
+{
+	u64 clock;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(se->cfs_rq->rq);
+
+	write_seqcount_begin(&se->idle_seqcount);
+	__schedstat_set(se->cg_idle_start, clock);
+	write_seqcount_end(&se->idle_seqcount);
+}
+
+void cgroup_idle_end(struct sched_entity *se)
+{
+	u64 clock;
+	u64 idle_start;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(se->cfs_rq->rq);
+
+	write_seqcount_begin(&se->idle_seqcount);
+	idle_start = schedstat_val(se->cg_idle_start);
+	__schedstat_add(se->cg_idle_sum, clock - idle_start);
+	__schedstat_set(se->cg_idle_start, 0);
+	write_seqcount_end(&se->idle_seqcount);
+}
+
+void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
+		struct cpumask *added)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	int cpu;
+
+	if (!schedstat_enabled())
+		return;
+
+	rcu_read_lock();
+	tg = cgroup_tg(cgrp);
+
+	if (!tg) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (added) {
+		/* Mark newly added cpus as newly-idle */
+		for_each_cpu(cpu, added) {
+			se = tg->se[cpu];
+			cgroup_idle_start(se);
+		}
+	}
+
+	if (deleted) {
+		/* Mark ineffective_cpus as idle-invalid */
+		for_each_cpu(cpu, deleted) {
+			se = tg->se[cpu];
+			cgroup_idle_end(se);
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 		struct task_group *tg, struct cpuacct_usage_result *res)
 {
@@ -484,10 +552,24 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 	res->irq = kcpustat->cpustat[CPUTIME_IRQ];
 	res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
 
-	if (se)
-		res->steal = se->statistics.wait_sum;
-	else
-		res->steal = 0;
+	if (se && schedstat_enabled()) {
+		unsigned int seq;
+		u64 idle_start;
+		u64 clock = cpu_clock(cpu);
+
+		do {
+			seq = read_seqcount_begin(&se->idle_seqcount);
+			res->idle = schedstat_val(se->cg_idle_sum);
+			idle_start = schedstat_val(se->cg_idle_start);
+			clock = cpu_clock(cpu);
+			if (idle_start && clock > idle_start)
+				res->idle += clock - idle_start;
+		} while (read_seqcount_retry(&se->idle_seqcount, seq));
+
+		res->steal = 0;
+	} else {
+		res->idle = res->iowait = res->steal = 0;
+	}
 	res->guest = kcpustat->cpustat[CPUTIME_GUEST];
 	res->guest_nice = kcpustat->cpustat[CPUTIME_GUEST_NICE];
 }
......
@@ -4465,9 +4465,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			break;
 
-		if (dequeue)
+		if (dequeue) {
+			if (se->my_q != cfs_rq)
+				cgroup_idle_start(se);
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		}
 		qcfs_rq->h_nr_running -= task_delta;
 
 		if (qcfs_rq->load.weight)
@@ -4504,6 +4506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
+	struct cfs_rq *bottom_cfs_rq = cfs_rq;
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
@@ -4533,8 +4536,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			enqueue = 0;
 
 		cfs_rq = cfs_rq_of(se);
-		if (enqueue)
+		if (enqueue) {
+			if (se->my_q != bottom_cfs_rq)
+				cgroup_idle_end(se);
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		}
 		cfs_rq->h_nr_running += task_delta;
 
 		if (cfs_rq_throttled(cfs_rq))
@@ -5132,14 +5138,22 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
 
+		if (!entity_is_task(se))
+			cgroup_idle_end(se);
+
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running increment below.
 		 */
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			if (cfs_rq->nr_running == 1)
+				cgroup_idle_end(se->parent);
+#endif
 			break;
+		}
 		cfs_rq->h_nr_running++;
 
 		flags = ENQUEUE_WAKEUP;
@@ -5179,14 +5193,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
+		if (!entity_is_task(se))
+			cgroup_idle_start(se);
+
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running decrement below.
 		 */
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			if (!cfs_rq->nr_running)
+				cgroup_idle_start(se->parent);
+#endif
 			break;
+		}
 		cfs_rq->h_nr_running--;
 
 		/* Don't dequeue parent if it has other entities besides us */
@@ -10162,6 +10184,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
+	seqcount_init(&se->idle_seqcount);
+	se->cg_idle_start = cpu_clock(cpu);
 }
 
 static DEFINE_MUTEX(shares_mutex);
......