Commit 61e58859 authored by Yihao Wu

alinux: sched: Introduce per-cgroup idle accounting

to #26424323

Since idle is what we care about, let's take idle as the central state and
omit transitions between the other states. Below is the state transition
graph:

                                sleep->deque
+-----------+ cpumask +-------+ exit->deque +-------+
|ineffective|-------- | idle  | <-----------|running|
+-----------+         +-------+             +-------+
                        ^ |
 unthrtl child -> deque | |
          wake -> deque | |thrtl child -> enque
       migrate -> deque | |migrate -> enque
                        | v
                      +-------+
                      | steal |
                      +-------+

We can summarize the idle state condition as:

!se->on_rq && !my_q->throttled && cpu allowed.

From this graph and condition, we can hook (de|en)queue_task_fair,
update_cpumasks_hier and (un)throttle_cfs_rq to account the idle state.

In the hooked functions, we also check these conditions to avoid
accounting unwanted CPU clocks.
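
For reference, a minimal sketch of that condition as a C predicate. The
helper name and the effective_cpus argument are illustrative only and are
not part of this patch; in the patch itself the cpumask part is driven
from cpuset's update_cpumasks_hier():

	/*
	 * Sketch only: a group se counts as cgroup-idle when it is dequeued,
	 * its own cfs_rq (se->my_q) is not throttled, and this CPU is still
	 * in the cgroup's effective cpumask.
	 */
	static inline bool se_cgroup_idle(struct sched_entity *se, int cpu,
					  const struct cpumask *effective_cpus)
	{
		return !se->on_rq &&
		       !cfs_rq_throttled(group_cfs_rq(se)) &&
		       cpumask_test_cpu(cpu, effective_cpus);
	}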
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Signed-off-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Acked-by: Michael Wang <yun.wang@linux.alibaba.com>
Parent 965d75d3
@@ -902,4 +902,16 @@ static inline void put_cgroup_ns(struct cgroup_namespace *ns)
 		free_cgroup_ns(ns);
 }
 
+#ifdef CONFIG_SCHED_SLI
+void cpuacct_cpuset_changed(struct cgroup *cgrp,
+		struct cpumask *effective, struct cpumask *new_added);
+void cgroup_idle_end(struct sched_entity *se);
+void cgroup_idle_start(struct sched_entity *se);
+#else
+static inline void cpuacct_cpuset_changed(struct cgroup *cgrp,
+		struct cpumask *effective, struct cpumask *new_added) { }
+static inline void cgroup_idle_end(struct sched_entity *se) { }
+static inline void cgroup_idle_start(struct sched_entity *se) { }
+#endif
+
 #endif /* _LINUX_CGROUP_H */
@@ -462,6 +462,10 @@ struct sched_entity {
 	u64			vruntime;
 	u64			prev_sum_exec_runtime;
 
+	u64			cg_idle_start;
+	u64			cg_idle_sum;
+	seqcount_t		idle_seqcount;
+
 	u64			nr_migrations;
 
 	struct sched_statistics	statistics;
......
@@ -896,6 +896,7 @@ static void update_tasks_cpumask(struct cpuset *cs)
 static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 {
 	struct cpuset *cp;
+	struct cpumask added, deleted, old_cpus;
 	struct cgroup_subsys_state *pos_css;
 	bool need_rebuild_sched_domains = false;
 
@@ -912,6 +913,11 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		if (is_in_v2_mode() && cpumask_empty(new_cpus))
 			cpumask_copy(new_cpus, parent->effective_cpus);
 
+		if (cpumask_empty(cp->effective_cpus))
+			cpumask_copy(&old_cpus, parent->effective_cpus);
+		else
+			cpumask_copy(&old_cpus, cp->effective_cpus);
+
 		/* Skip the whole subtree if the cpumask remains the same. */
 		if (cpumask_equal(new_cpus, cp->effective_cpus)) {
 			pos_css = css_rightmost_descendant(pos_css);
@@ -929,8 +935,16 @@ static void update_cpumasks_hier(struct cpuset *cs, struct cpumask *new_cpus)
 		WARN_ON(!is_in_v2_mode() &&
 			!cpumask_equal(cp->cpus_allowed, cp->effective_cpus));
 
+		/* add = new - old = new & (~old) */
+		cpumask_andnot(&added, new_cpus, &old_cpus);
+		cpuacct_cpuset_changed(cs->css.cgroup, NULL, &added);
+
 		update_tasks_cpumask(cp);
 
+		/* deleted = old - new = old & (~new) */
+		cpumask_andnot(&deleted, &old_cpus, new_cpus);
+		cpuacct_cpuset_changed(cs->css.cgroup, &deleted, NULL);
+
 		/*
 		 * If the effective cpumask of any non-empty cpuset is changed,
 		 * we need to rebuild sched domains.
@@ -2026,6 +2040,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
 	cs->effective_mems = parent->mems_allowed;
 	cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
 	cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
+	cpuacct_cpuset_changed(cs->css.cgroup, NULL, cs->effective_cpus);
 	spin_unlock_irq(&callback_lock);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
......
@@ -445,6 +445,74 @@ static unsigned long ca_uninterruptible(struct cpuacct *ca, int cpu)
 	return nr;
 }
 
+void cgroup_idle_start(struct sched_entity *se)
+{
+	u64 clock;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(se->cfs_rq->rq);
+
+	write_seqcount_begin(&se->idle_seqcount);
+	__schedstat_set(se->cg_idle_start, clock);
+	write_seqcount_end(&se->idle_seqcount);
+}
+
+void cgroup_idle_end(struct sched_entity *se)
+{
+	u64 clock;
+	u64 idle_start;
+
+	if (!schedstat_enabled())
+		return;
+
+	clock = __rq_clock_broken(se->cfs_rq->rq);
+
+	write_seqcount_begin(&se->idle_seqcount);
+	idle_start = schedstat_val(se->cg_idle_start);
+	__schedstat_add(se->cg_idle_sum, clock - idle_start);
+	__schedstat_set(se->cg_idle_start, 0);
+	write_seqcount_end(&se->idle_seqcount);
+}
+
+void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
+		struct cpumask *added)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+	int cpu;
+
+	if (!schedstat_enabled())
+		return;
+
+	rcu_read_lock();
+	tg = cgroup_tg(cgrp);
+
+	if (!tg) {
+		rcu_read_unlock();
+		return;
+	}
+
+	if (added) {
+		/* Mark newly added cpus as newly-idle */
+		for_each_cpu(cpu, added) {
+			se = tg->se[cpu];
+			cgroup_idle_start(se);
+		}
+	}
+
+	if (deleted) {
+		/* Mark ineffective_cpus as idle-invalid */
+		for_each_cpu(cpu, deleted) {
+			se = tg->se[cpu];
+			cgroup_idle_end(se);
+		}
+	}
+
+	rcu_read_unlock();
+}
+
 static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 		struct task_group *tg, struct cpuacct_usage_result *res)
 {
@@ -484,10 +552,24 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
 	res->irq = kcpustat->cpustat[CPUTIME_IRQ];
 	res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
 
-	if (se)
-		res->steal = se->statistics.wait_sum;
-	else
-		res->steal = 0;
+	if (se && schedstat_enabled()) {
+		unsigned int seq;
+		u64 idle_start;
+		u64 clock = cpu_clock(cpu);
+
+		do {
+			seq = read_seqcount_begin(&se->idle_seqcount);
+			res->idle = schedstat_val(se->cg_idle_sum);
+			idle_start = schedstat_val(se->cg_idle_start);
+			clock = cpu_clock(cpu);
+			if (idle_start && clock > idle_start)
+				res->idle += clock - idle_start;
+		} while (read_seqcount_retry(&se->idle_seqcount, seq));
+
+		res->steal = 0;
+	} else {
+		res->idle = res->iowait = res->steal = 0;
+	}
 	res->guest = kcpustat->cpustat[CPUTIME_GUEST];
 	res->guest_nice = kcpustat->cpustat[CPUTIME_GUEST_NICE];
 }
......
@@ -4465,9 +4465,11 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
 			break;
 
-		if (dequeue)
+		if (dequeue) {
+			if (se->my_q != cfs_rq)
+				cgroup_idle_start(se);
 			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+		}
 		qcfs_rq->h_nr_running -= task_delta;
 
 		if (qcfs_rq->load.weight)
@@ -4504,6 +4506,7 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 {
+	struct cfs_rq *bottom_cfs_rq = cfs_rq;
 	struct rq *rq = rq_of(cfs_rq);
 	struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
 	struct sched_entity *se;
@@ -4533,8 +4536,11 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
 			enqueue = 0;
 
 		cfs_rq = cfs_rq_of(se);
-		if (enqueue)
+		if (enqueue) {
+			if (se->my_q != bottom_cfs_rq)
+				cgroup_idle_end(se);
 			enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+		}
 		cfs_rq->h_nr_running += task_delta;
 
 		if (cfs_rq_throttled(cfs_rq))
@@ -5132,14 +5138,22 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		enqueue_entity(cfs_rq, se, flags);
 
+		if (!entity_is_task(se))
+			cgroup_idle_end(se);
+
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running increment below.
 		 */
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			if (cfs_rq->nr_running == 1)
+				cgroup_idle_end(se->parent);
+#endif
 			break;
+		}
 		cfs_rq->h_nr_running++;
 
 		flags = ENQUEUE_WAKEUP;
@@ -5179,14 +5193,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
 
+		if (!entity_is_task(se))
+			cgroup_idle_start(se);
+
 		/*
 		 * end evaluation on encountering a throttled cfs_rq
 		 *
 		 * note: in the case of encountering a throttled cfs_rq we will
 		 * post the final h_nr_running decrement below.
 		 */
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			if (!cfs_rq->nr_running)
+				cgroup_idle_start(se->parent);
+#endif
 			break;
+		}
 		cfs_rq->h_nr_running--;
 
 		/* Don't dequeue parent if it has other entities besides us */
@@ -10162,6 +10184,8 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
+	seqcount_init(&se->idle_seqcount);
+	se->cg_idle_start = cpu_clock(cpu);
 }
 
 static DEFINE_MUTEX(shares_mutex);
......