Commit 61e58859 authored by Yihao Wu

alinux: sched: Introduce per-cgroup idle accounting

to #26424323

Since we are concerned with idle time, let's take idle as the central
state and omit transitions between the other states. Below is the
state transition graph:

                                sleep->deque
+-----------+ cpumask +-------+ exit->deque +-------+
|ineffective|---------| idle  | <-----------|running|
+-----------+         +-------+             +-------+
                        ^ |
 unthrtl child -> deque | |
          wake -> deque | | thrtl child -> enque
       migrate -> deque | | migrate -> enque
                        | v
                      +-------+
                      | steal |
                      +-------+

The idle-state condition can be summarized as:

!se->on_rq && !my_q->throttled && cpu allowed.

From this graph and condition, we can hook (de|en)queue_task_fair,
update_cpumasks_hier, and (un|)throttle_cfs_rq to account the idle
state.

In the hooked functions, we also check these conditions to avoid
accounting unwanted CPU clock time.
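
As a reading aid only (not part of the patch), the condition could be
written as a hypothetical helper for a group scheduling entity, where
group_cfs_rq(se) is se->my_q and the effective cpumask would be supplied
by the entity's cgroup via cpuset:

	static inline bool se_cgroup_idle(struct sched_entity *se, int cpu,
					  const struct cpumask *effective)
	{
		/* off the runqueue, own cfs_rq not throttled, cpu still allowed */
		return !se->on_rq &&
		       !cfs_rq_throttled(group_cfs_rq(se)) &&
		       cpumask_test_cpu(cpu, effective);
	}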
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Signed-off-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Acked-by: Michael Wang <yun.wang@linux.alibaba.com>
Parent 965d75d3
@@ -902,4 +902,16 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns)
free_cgroup_ns(ns);
}
#ifdef CONFIG_SCHED_SLI
void cpuacct_cpuset_changed(struct cgroup *cgrp,
struct cpumask *effective, struct cpumask *new_added);
void cgroup_idle_end(struct sched_entity *se);
void cgroup_idle_start(struct sched_entity *se);
#else
static inline void cpuacct_cpuset_changed(struct cgroup *cgrp,
struct cpumask *effective, struct cpumask *new_added) { }
static inline void cgroup_idle_end(struct sched_entity *se) { }
static inline void cgroup_idle_start(struct sched_entity *se) { }
#endif
#endif /* _LINUX_CGROUP_H */
@@ -462,6 +462,10 @@ struct sched_entity {
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 cg_idle_start;
u64 cg_idle_sum;
seqcount_t idle_seqcount;
u64 nr_migrations;
struct sched_statistics statistics;
......
@@ -896,6 +896,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
{
struct cpuset *cp;
struct cpumask added, deleted, old_cpus;
struct cgroup_subsys_state *pos_css;
bool need_rebuild_sched_domains = false;
@@ -912,6 +913,11 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
if (is_in_v2_mode() && cpumask_empty(new_cpus))
cpumask_copy(new_cpus, parent->effective_cpus);
if (cpumask_empty(cp->effective_cpus))
cpumask_copy(&old_cpus, parent->effective_cpus);
else
cpumask_copy(&old_cpus, cp->effective_cpus);
/* Skip the whole subtree if the cpumask remains the same. */
if (cpumask_equal(new_cpus, cp->effective_cpus)) {
pos_css = css_rightmost_descendant(pos_css);
@@ -929,8 +935,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
WARN_ON(!is_in_v2_mode() &&
!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
/* add = new - old = new & (~old) */
cpumask_andnot(&added, new_cpus, &old_cpus);
cpuacct_cpuset_changed(cs->css.cgroup, NULL, &added);
update_tasks_cpumask(cp);
/* deleted = old - new = old & (~new) */
cpumask_andnot(&deleted, &old_cpus, new_cpus);
cpuacct_cpuset_changed(cs->css.cgroup, &deleted, NULL);
/*
* If the effective cpumask of any non-empty cpuset is changed,
* we need to rebuild sched domains.
@@ -2026,6 +2040,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
cpuacct_cpuset_changed(cs->css.cgroup, NULL, cs->effective_cpus);
spin_unlock_irq(&callback_lock);
out_unlock:
mutex_unlock(&cpuset_mutex);
......
@@ -445,6 +445,74 @@ static unsigned long ca_uninterruptible(struct cpuacct *ca, int cpu)
return nr;
}
void cgroup_idle_start(struct sched_entity *se)
{
u64 clock;
if (!schedstat_enabled())
return;
clock = __rq_clock_broken(se->cfs_rq->rq);
write_seqcount_begin(&se->idle_seqcount);
__schedstat_set(se->cg_idle_start, clock);
write_seqcount_end(&se->idle_seqcount);
}
void cgroup_idle_end(struct sched_entity *se)
{
u64 clock;
u64 idle_start;
if (!schedstat_enabled())
return;
clock = __rq_clock_broken(se->cfs_rq->rq);
write_seqcount_begin(&se->idle_seqcount);
idle_start = schedstat_val(se->cg_idle_start);
__schedstat_add(se->cg_idle_sum, clock - idle_start);
__schedstat_set(se->cg_idle_start, 0);
write_seqcount_end(&se->idle_seqcount);
}
void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
struct cpumask *added)
{
struct task_group *tg;
struct sched_entity *se;
int cpu;
if (!schedstat_enabled())
return;
rcu_read_lock();
tg = cgroup_tg(cgrp);
if (!tg) {
rcu_read_unlock();
return;
}
if (added) {
/* Mark newly added cpus as newly-idle */
for_each_cpu(cpu, added) {
se = tg->se[cpu];
cgroup_idle_start(se);
}
}
if (deleted) {
/* Mark ineffective_cpus as idle-invalid */
for_each_cpu(cpu, deleted) {
se = tg->se[cpu];
cgroup_idle_end(se);
}
}
rcu_read_unlock();
}
static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
struct task_group *tg, struct cpuacct_usage_result *res)
{
@@ -484,10 +552,24 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
res->irq = kcpustat->cpustat[CPUTIME_IRQ];
res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
if (se)
res->steal = se->statistics.wait_sum;
else
if (se && schedstat_enabled()) {
unsigned int seq;
u64 idle_start;
u64 clock = cpu_clock(cpu);
do {
seq = read_seqcount_begin(&se->idle_seqcount);
res->idle = schedstat_val(se->cg_idle_sum);
idle_start = schedstat_val(se->cg_idle_start);
clock = cpu_clock(cpu);
if (idle_start && clock > idle_start)
res->idle += clock - idle_start;
} while (read_seqcount_retry(&se->idle_seqcount, seq));
res->steal = 0;
} else {
res->idle = res->iowait = res->steal = 0;
}
res->guest = kcpustat->cpustat[CPUTIME_GUEST];
res->guest_nice = kcpustat->cpustat[CPUTIME_GUEST_NICE];
}
......
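
The writer side above (cgroup_idle_start/cgroup_idle_end) and the
lockless reader in __cpuacct_get_usage_result are paired through
se->idle_seqcount. A condensed sketch of that pairing, with the
schedstat wrappers dropped and writers assumed to be serialized by
their callers:

	/* writer: open an odd..even sequence window around the update */
	write_seqcount_begin(&se->idle_seqcount);
	/* ... update se->cg_idle_start / se->cg_idle_sum ... */
	write_seqcount_end(&se->idle_seqcount);

	/* reader: lockless; retries whenever a writer ran concurrently */
	do {
		seq  = read_seqcount_begin(&se->idle_seqcount);
		idle = se->cg_idle_sum;
		if (se->cg_idle_start)		/* idle period still in flight */
			idle += cpu_clock(cpu) - se->cg_idle_start;
	} while (read_seqcount_retry(&se->idle_seqcount, seq));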
@@ -4465,9 +4465,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
break;
if (dequeue)
if (dequeue) {
if (se->my_q != cfs_rq)
cgroup_idle_start(se);
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
}
qcfs_rq->h_nr_running -= task_delta;
if (qcfs_rq->load.weight)
@@ -4504,6 +4506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
{
struct cfs_rq *bottom_cfs_rq = cfs_rq;
struct rq *rq = rq_of(cfs_rq);
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
struct sched_entity *se;
@@ -4533,8 +4536,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
enqueue = 0;
cfs_rq = cfs_rq_of(se);
if (enqueue)
if (enqueue) {
if (se->my_q != bottom_cfs_rq)
cgroup_idle_end(se);
enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
}
cfs_rq->h_nr_running += task_delta;
if (cfs_rq_throttled(cfs_rq))
@@ -5132,14 +5138,22 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
enqueue_entity(cfs_rq, se, flags);
if (!entity_is_task(se))
cgroup_idle_end(se);
/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running increment below.
*/
if (cfs_rq_throttled(cfs_rq))
if (cfs_rq_throttled(cfs_rq)) {
#ifdef CONFIG_FAIR_GROUP_SCHED
if (cfs_rq->nr_running == 1)
cgroup_idle_end(se->parent);
#endif
break;
}
cfs_rq->h_nr_running++;
flags = ENQUEUE_WAKEUP;
@@ -5179,14 +5193,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
if (!entity_is_task(se))
cgroup_idle_start(se);
/*
* end evaluation on encountering a throttled cfs_rq
*
* note: in the case of encountering a throttled cfs_rq we will
* post the final h_nr_running decrement below.
*/
if (cfs_rq_throttled(cfs_rq))
if (cfs_rq_throttled(cfs_rq)) {
#ifdef CONFIG_FAIR_GROUP_SCHED
if (!cfs_rq->nr_running)
cgroup_idle_start(se->parent);
#endif
break;
}
cfs_rq->h_nr_running--;
/* Don't dequeue parent if it has other entities besides us */
@@ -10162,6 +10184,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
/* guarantee group entities always have weight */
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
seqcount_init(&se->idle_seqcount);
se->cg_idle_start = cpu_clock(cpu);
}
static DEFINE_MUTEX(shares_mutex);
......