From 61e5885959be9af0b6942ec2a3ea0e7ccc48bf05 Mon Sep 17 00:00:00 2001
From: Yihao Wu
Date: Tue, 10 Mar 2020 17:02:14 +0800
Subject: [PATCH] alinux: sched: Introduce per-cgroup idle accounting

to #26424323

Since we are concerned with idle, let's take idle as the central state
and omit the transitions between the other states. Below is the state
transition graph:

sleep->deque  +-----------+  cpumask  +-------+  exit->deque  +-------+
              |ineffective|---------  |  idle | <-------------|running|
              +-----------+           +-------+               +-------+
                                         ^  |
                 unthrtl child -> deque  |  |  wake -> deque
                                         |  |  thrtl child -> enque
                      migrate -> deque   |  |  migrate -> enque
                                         |  v
                                      +-------+
                                      | steal |
                                      +-------+

The idle-state condition can therefore be summarized as:

    !se->on_rq && !my_q->throttled && cpu allowed

From this graph and condition, we can hook (de|en)queue_task_fair,
update_cpumasks_hier and (un|)throttle_cfs_rq to account the idle
state. In the hooked functions, we also check these conditions to
avoid accounting unwanted CPU clock time. (An illustrative user-space
sketch of this bookkeeping is appended after the patch.)

Signed-off-by: Yihao Wu
Signed-off-by: Shanpei Chen
Acked-by: Michael Wang
---
 include/linux/cgroup.h | 12 ++++++
 include/linux/sched.h  |  4 ++
 kernel/cgroup/cpuset.c | 15 +++++++
 kernel/sched/cpuacct.c | 88 ++++++++++++++++++++++++++++++++++++++++--
 kernel/sched/fair.c    | 34 +++++++++++++---
 5 files changed, 145 insertions(+), 8 deletions(-)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index dbe0d5d4f36a..c513278b274c 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -902,4 +902,16 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 	free_cgroup_ns(ns);
 }
 
+#ifdef CONFIG_SCHED_SLI
+void cpuacct_cpuset_changed(struct cgroup *cgrp,
+		struct cpumask *effective, struct cpumask *new_added);
+void cgroup_idle_end(struct sched_entity *se);
+void cgroup_idle_start(struct sched_entity *se);
+#else
+static inline void cpuacct_cpuset_changed(struct cgroup *cgrp,
+		struct cpumask *effective, struct cpumask *new_added) { }
+static inline void cgroup_idle_end(struct sched_entity *se) { }
+static inline void cgroup_idle_start(struct sched_entity *se) { }
+#endif
+
 #endif /* _LINUX_CGROUP_H */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6253fdc62623..c111dcf60064 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -462,6 +462,10 @@ struct sched_entity {
 	u64				vruntime;
 	u64				prev_sum_exec_runtime;
 
+	u64				cg_idle_start;
+	u64				cg_idle_sum;
+	seqcount_t			idle_seqcount;
+
 	u64				nr_migrations;
 
 	struct sched_statistics		statistics;
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index ff956ccbb6df..290177bd7466 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -896,6 +896,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
 	struct cpuset *cp;
+	struct cpumask added, deleted, old_cpus;
 	struct cgroup_subsys_state *pos_css;
 	bool need_rebuild_sched_domains = false;
 
@@ -912,6 +913,11 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		if (is_in_v2_mode() && cpumask_empty(new_cpus))
 			cpumask_copy(new_cpus, parent->effective_cpus);
 
+		if (cpumask_empty(cp->effective_cpus))
+			cpumask_copy(&old_cpus, parent->effective_cpus);
+		else
+			cpumask_copy(&old_cpus, cp->effective_cpus);
+
 		/* Skip the whole subtree if the cpumask remains the same. */
 		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
 			pos_css = css_rightmost_descendant(pos_css);
@@ -929,8 +935,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
+		/* add = new - old = new & (~old) */
+		cpumask_andnot(&added, new_cpus, &old_cpus);
+		cpuacct_cpuset_changed(cs->css.cgroup, NULL, &added);
+
 		update_tasks_cpumask(cp);
 
+		/* deleted = old - new = old & (~new) */
+		cpumask_andnot(&deleted, &old_cpus, new_cpus);
+		cpuacct_cpuset_changed(cs->css.cgroup, &deleted, NULL);
+
 		/*
 		 * If the effective cpumask of any non-empty cpuset is changed,
 		 * we need to rebuild sched domains.
@@ -2026,6 +2040,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+	cpuacct_cpuset_changed(cs->css.cgroup, NULL, cs->effective_cpus);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
index bda1c626ccb9..4b640ce8766c 100644
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -445,6 +445,74 @@ static unsigned long ca_uninterruptible(struct cpuacct *ca, int cpu)
 	return nr;
 }
 
+void cgroup_idle_start(struct sched_entity *se)
+{
+	u64 clock;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(se->cfs_rq->rq);
+
+	write_seqcount_begin(&se->idle_seqcount);
+	__schedstat_set(se->cg_idle_start, clock);
+	write_seqcount_end(&se->idle_seqcount);
+}
+
+void cgroup_idle_end(struct sched_entity *se)
+{
+	u64 clock;
+	u64 idle_start;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(se->cfs_rq->rq);
+
+	write_seqcount_begin(&se->idle_seqcount);
+	idle_start = schedstat_val(se->cg_idle_start);
+	__schedstat_add(se->cg_idle_sum, clock - idle_start);
+	__schedstat_set(se->cg_idle_start, 0);
+	write_seqcount_end(&se->idle_seqcount);
+}
+
+void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
+		struct cpumask *added)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	int cpu;
+
+	if (!schedstat_enabled())
+		return;
+
+	rcu_read_lock();
+	tg = cgroup_tg(cgrp);
+
+	if (!tg) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (added) {
+		/* Mark newly added cpus as newly-idle */
+		for_each_cpu(cpu, added) {
+			se = tg->se[cpu];
+			cgroup_idle_start(se);
+		}
+	}
+
+	if (deleted) {
+		/* Mark ineffective_cpus as idle-invalid */
+		for_each_cpu(cpu, deleted) {
+			se = tg->se[cpu];
+			cgroup_idle_end(se);
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 		struct task_group *tg, struct cpuacct_usage_result *res)
 {
@@ -484,10 +552,24 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 	res->irq = kcpustat->cpustat[CPUTIME_IRQ];
 	res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
 
-	if (se)
-		res->steal = se->statistics.wait_sum;
-	else
+	if (se && schedstat_enabled()) {
+		unsigned int seq;
+		u64 idle_start;
+		u64 clock = cpu_clock(cpu);
+
+		do {
+			seq = read_seqcount_begin(&se->idle_seqcount);
+			res->idle = schedstat_val(se->cg_idle_sum);
+			idle_start = schedstat_val(se->cg_idle_start);
+			clock = cpu_clock(cpu);
+			if (idle_start && clock > idle_start)
+				res->idle += clock - idle_start;
+		} while (read_seqcount_retry(&se->idle_seqcount, seq));
+
 		res->steal = 0;
+	} else {
+		res->idle = res->iowait = res->steal = 0;
+	}
 	res->guest = kcpustat->cpustat[CPUTIME_GUEST];
 	res->guest_nice = kcpustat->cpustat[CPUTIME_GUEST_NICE];
 }
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 90295a16b631..b373dd7a6c58 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4465,9 +4465,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			break;
-
-		if (dequeue)
+		if (dequeue) {
+			if (se->my_q != cfs_rq)
+				cgroup_idle_start(se);
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		}
 		qcfs_rq->h_nr_running -= task_delta;
 
 		if (qcfs_rq->load.weight)
@@ -4504,6 +4506,7 @@
 
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
+	struct cfs_rq *bottom_cfs_rq = cfs_rq;
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
@@ -4533,8 +4536,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			enqueue = 0;
 
 		cfs_rq = cfs_rq_of(se);
-		if (enqueue)
+		if (enqueue) {
+			if (se->my_q != bottom_cfs_rq)
+				cgroup_idle_end(se);
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		}
 		cfs_rq->h_nr_running += task_delta;
 
 		if (cfs_rq_throttled(cfs_rq))
@@ -5132,14 +5138,22 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
 
+		if (!entity_is_task(se))
+			cgroup_idle_end(se);
+
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running increment below.
 		 */
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			if (cfs_rq->nr_running == 1)
+				cgroup_idle_end(se->parent);
+#endif
 			break;
+		}
 		cfs_rq->h_nr_running++;
 
 		flags = ENQUEUE_WAKEUP;
@@ -5179,14 +5193,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
+		if (!entity_is_task(se))
+			cgroup_idle_start(se);
+
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running decrement below.
 		 */
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			if (!cfs_rq->nr_running)
+				cgroup_idle_start(se->parent);
+#endif
 			break;
+		}
 		cfs_rq->h_nr_running--;
 
 		/* Don't dequeue parent if it has other entities besides us */
@@ -10162,6 +10184,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
+	seqcount_init(&se->idle_seqcount);
+	se->cg_idle_start = cpu_clock(cpu);
 }
 
 static DEFINE_MUTEX(shares_mutex);
--
GitLab
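
For reference, the accounting pattern introduced above can be modelled outside
the kernel. The sketch below is only an illustration of the bookkeeping, not
kernel code: a writer stamps the start and end of an idle period around the
(de|en)queue events, and a reader folds an in-progress idle period into the
running sum, retrying if it raced with a writer. Everything in it (model_se,
model_idle_start/end/read, now_ns, the plain idle_seq counter) is invented for
this example; the patch itself uses seqcount_t, the rq clock and the schedstat
helpers instead.

/*
 * Minimal single-threaded user-space model of the per-cgroup idle
 * accounting.  Illustration only; all names are invented for this sketch.
 */
#define _POSIX_C_SOURCE 199309L
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct model_se {
	unsigned int idle_seq;   /* stand-in for seqcount_t idle_seqcount */
	uint64_t cg_idle_start;  /* start of current idle period, 0 if not idle */
	uint64_t cg_idle_sum;    /* accumulated idle time, in ns */
};

static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Dequeue path: the group entity leaves the runqueue, idle begins. */
static void model_idle_start(struct model_se *se)
{
	se->idle_seq++;                 /* open write section (seq becomes odd) */
	se->cg_idle_start = now_ns();
	se->idle_seq++;                 /* close write section (seq becomes even) */
}

/* Enqueue path: the group entity goes back on the runqueue, idle ends. */
static void model_idle_end(struct model_se *se)
{
	uint64_t now = now_ns();

	se->idle_seq++;
	se->cg_idle_sum += now - se->cg_idle_start;
	se->cg_idle_start = 0;
	se->idle_seq++;
}

/*
 * Reader: report the idle sum plus any in-progress idle period, retrying
 * if a writer was active in between, as the cpuacct reader does.
 */
static uint64_t model_idle_read(const struct model_se *se)
{
	unsigned int seq;
	uint64_t idle, start;

	do {
		seq = se->idle_seq;
		idle = se->cg_idle_sum;
		start = se->cg_idle_start;
		if (start)
			idle += now_ns() - start;
	} while ((seq & 1) || seq != se->idle_seq);  /* writer raced: retry */

	return idle;
}

int main(void)
{
	struct model_se se = { 0 };

	model_idle_start(&se);          /* e.g. the last task was dequeued */
	printf("idle so far: %llu ns\n",
	       (unsigned long long)model_idle_read(&se));
	model_idle_end(&se);            /* e.g. a task woke up and was enqueued */
	printf("idle total:  %llu ns\n",
	       (unsigned long long)model_idle_read(&se));
	return 0;
}

The patch achieves the same effect with write_seqcount_begin()/
write_seqcount_end() around cg_idle_start/cg_idle_sum in cgroup_idle_start()/
cgroup_idle_end(), and with the read_seqcount_begin()/read_seqcount_retry()
loop in __cpuacct_get_usage_result(), except that the writers run on the
scheduler's dequeue/enqueue paths and the clock is the per-CPU rq clock (or
cpu_clock()) rather than CLOCK_MONOTONIC.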