Commit c7552980 authored by Yihao Wu

alinux: sched: Introduce per-cgroup steal accounting

to #26424323

From the previous patch, we know there are 4 possible states.
Since the steal state's transitions are complex, we choose to
account for its complement instead:

        steal = elapse - idle - sum_exec_raw - ineffective

where elapse is the time since the cgroup was created, sum_exec_raw is
the running time including IRQ time, and ineffective is the total time
during which the cpuset bound to the cpuacct does not allow the cgroup
to run on this cpu.
Signed-off-by: Yihao Wu <wuyihao@linux.alibaba.com>
Signed-off-by: Shanpei Chen <shanpeic@linux.alibaba.com>
Acked-by: Michael Wang <yun.wang@linux.alibaba.com>
Parent 61e58859
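For reference, the sketch below shows how the formula above maps onto the new fields. It is a hypothetical helper written for illustration only (not part of the patch); it mirrors the computation added to __cpuacct_get_usage_result() further down and assumes the kernel's u64 type from <linux/types.h>:

static inline u64 cg_steal_sketch(u64 clock, u64 cg_init_time,
				  u64 idle, u64 sum_exec_raw,
				  u64 ineffective)
{
	/* Time elapsed on this cpu since the cgroup was created. */
	u64 elapse = clock - cg_init_time;
	/* Everything accounted as something other than steal. */
	u64 complement = idle + sum_exec_raw + ineffective;

	/* Whatever is left of the elapsed time is reported as steal. */
	return elapse > complement ? elapse - complement : 0;
}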
@@ -462,8 +462,14 @@ struct sched_entity {
u64 vruntime;
u64 prev_sum_exec_runtime;
/* irq time is included */
u64 exec_start_raw;
u64 sum_exec_raw;
u64 cg_idle_start;
u64 cg_idle_sum;
u64 cg_init_time;
u64 cg_ineffective_sum;
u64 cg_ineffective_start;
seqcount_t idle_seqcount;
u64 nr_migrations;
@@ -499,6 +499,10 @@ void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
for_each_cpu(cpu, added) {
se = tg->se[cpu];
cgroup_idle_start(se);
__schedstat_add(se->cg_ineffective_sum,
__rq_clock_broken(cpu_rq(cpu)) -
se->cg_ineffective_start);
__schedstat_set(se->cg_ineffective_start, 0);
}
}
@@ -507,6 +511,9 @@ void cpuacct_cpuset_changed(struct cgroup *cgrp, struct cpumask *deleted,
for_each_cpu(cpu, deleted) {
se = tg->se[cpu];
cgroup_idle_end(se);
/* Use __rq_clock_broken to avoid warning */
__schedstat_set(se->cg_ineffective_start,
__rq_clock_broken(cpu_rq(cpu)));
}
}
@@ -554,8 +561,8 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
res->softirq = kcpustat->cpustat[CPUTIME_SOFTIRQ];
if (se && schedstat_enabled()) {
unsigned int seq;
u64 idle_start;
u64 clock = cpu_clock(cpu);
u64 idle_start, ineff, ineff_start, elapse, complement;
u64 clock;
do {
seq = read_seqcount_begin(&se->idle_seqcount);
@@ -566,7 +573,18 @@ static void __cpuacct_get_usage_result(struct cpuacct *ca, int cpu,
res->idle += clock - idle_start;
} while (read_seqcount_retry(&se->idle_seqcount, seq));
ineff = schedstat_val(se->cg_ineffective_sum);
ineff_start = schedstat_val(se->cg_ineffective_start);
if (ineff_start)
__schedstat_add(ineff, clock - ineff_start);
res->steal = 0;
elapse = clock - schedstat_val(se->cg_init_time);
complement = res->idle + se->sum_exec_raw + ineff;
if (elapse > complement)
res->steal = elapse - complement;
} else {
res->idle = res->iowait = res->steal = 0;
}
@@ -796,6 +796,15 @@ static void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
}
#endif /* CONFIG_SMP */
static inline void
update_exec_raw(struct cfs_rq *cfs_rq, struct sched_entity *curr)
{
u64 now = rq_clock(rq_of(cfs_rq));
curr->sum_exec_raw += now - curr->exec_start_raw;
curr->exec_start_raw = now;
}
/*
* Update the current task's runtime statistics.
*/
@@ -832,6 +841,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
}
account_cfs_rq_runtime(cfs_rq, delta_exec);
update_exec_raw(cfs_rq, curr);
}
static void update_curr_fair(struct rq *rq)
@@ -1013,6 +1023,7 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* We are starting a new run period:
*/
se->exec_start = rq_clock_task(rq_of(cfs_rq));
se->exec_start_raw = rq_clock_task(rq_of(cfs_rq));
}
/**************************************************
@@ -10185,7 +10196,7 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
update_load_set(&se->load, NICE_0_LOAD);
se->parent = parent;
seqcount_init(&se->idle_seqcount);
se->cg_idle_start = cpu_clock(cpu);
se->cg_idle_start = se->cg_init_time = cpu_clock(cpu);
}
static DEFINE_MUTEX(shares_mutex);