From 9b44ae617aa6d599becef9387af03579dc508744 Mon Sep 17 00:00:00 2001
From: Huaixin Chang
Date: Mon, 23 Dec 2019 19:40:35 +0800
Subject: [PATCH] sched/fair: use static load in wake_affine_weight

For a long time, runnable cpu load has been used to select the task rq
when waking up tasks. Recent tests have shown that, for workloads with
a large number of short-running tasks and nearly full cpu utilization,
static load is more helpful. In our e2e tests, the runnable load avg of
Java threads ranges from less than 10 to as large as 362, even though
these threads are no different from each other and should be treated in
the same way. After switching to static load, QPS improvements have
been seen in multiple test cases.

A new sched feature WA_STATIC_WEIGHT is introduced here to control this
behavior. Echo WA_STATIC_WEIGHT into /sys/kernel/debug/sched_features
to turn static load in wake_affine_weight on, and NO_WA_STATIC_WEIGHT
to turn it off. The feature is kept off by default.

Tests were done on the following hardware:

	4 threads Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz

In tests with 120 threads and SQL loglevel configured to info:

	NO_WA_STATIC_WEIGHT	WA_STATIC_WEIGHT
	33170.63		34614.95 (+4.35%)

In tests with 160 threads and SQL loglevel configured to info:

	NO_WA_STATIC_WEIGHT	WA_STATIC_WEIGHT
	35888.71		38247.20 (+6.57%)

In tests with 160 threads and SQL loglevel configured to warn:

	NO_WA_STATIC_WEIGHT	WA_STATIC_WEIGHT
	39118.72		39698.72 (+1.48%)

Signed-off-by: Huaixin Chang
Acked-by: Shanpei Chen
---
 kernel/sched/fair.c     | 72 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/features.h |  1 +
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 784f6b1b9bee..1ffcbaccfde5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -692,6 +692,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
+static unsigned long task_h_load_static(struct task_struct *p);
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -5645,10 +5646,19 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	if (sched_feat(WA_STATIC_WEIGHT))
+		this_eff_load =
+			scale_load_down(cpu_rq(this_cpu)->cfs.load.weight);
+	else
+		this_eff_load = target_load(this_cpu, sd->wake_idx);
 
 	if (sync) {
-		unsigned long current_load = task_h_load(current);
+		unsigned long current_load;
+
+		if (sched_feat(WA_STATIC_WEIGHT))
+			current_load = task_h_load_static(current);
+		else
+			current_load = task_h_load(current);
 
 		if (current_load > this_eff_load)
 			return this_cpu;
@@ -5656,14 +5666,21 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		this_eff_load -= current_load;
 	}
 
-	task_load = task_h_load(p);
+	if (sched_feat(WA_STATIC_WEIGHT))
+		task_load = task_h_load_static(p);
+	else
+		task_load = task_h_load(p);
 
 	this_eff_load += task_load;
 	if (sched_feat(WA_BIAS))
 		this_eff_load *= 100;
 	this_eff_load *= capacity_of(prev_cpu);
 
-	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+	if (sched_feat(WA_STATIC_WEIGHT))
+		prev_eff_load =
+			scale_load_down(cpu_rq(prev_cpu)->cfs.load.weight);
+	else
+		prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 	prev_eff_load -= task_load;
 	if (sched_feat(WA_BIAS))
 		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -7498,6 +7515,48 @@ static unsigned long task_h_load(struct task_struct *p)
 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
 			cfs_rq_load_avg(cfs_rq) + 1);
 }
+
+static void update_cfs_rq_h_load_static(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+	unsigned long now = jiffies;
+	unsigned long load;
+
+	if (cfs_rq->last_h_load_update == now)
+		return;
+
+	WRITE_ONCE(cfs_rq->h_load_next, NULL);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		WRITE_ONCE(cfs_rq->h_load_next, se);
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
+
+	if (!se) {
+		cfs_rq->h_load = scale_load_down(cfs_rq->load.weight);
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->load.weight,
+				cfs_rq->load.weight + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
+}
+
+static unsigned long task_h_load_static(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	update_cfs_rq_h_load_static(cfs_rq);
+	return div64_ul(p->se.load.weight * cfs_rq->h_load,
+			cfs_rq->load.weight + 1);
+}
 #else
 static inline void update_blocked_averages(int cpu)
 {
@@ -7526,6 +7585,11 @@ static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg;
 }
+
+static unsigned long task_h_load_static(struct task_struct *p)
+{
+	return scale_load_down(p->se.load.weight);
+}
 #endif
 
 /********** Helpers for find_busiest_group ************************/
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 85ae8488039c..6de8eff23e3b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,6 +85,7 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+SCHED_FEAT(WA_STATIC_WEIGHT, false)
 
 /*
  * UtilEstimation. Use estimated CPU utilization.
-- 
GitLab
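
As a usage sketch only (assuming debugfs is mounted at /sys/kernel/debug,
a kernel built with CONFIG_SCHED_DEBUG, and root privileges), the knob
described in the changelog can be toggled and inspected at runtime:

	# turn static load in wake_affine_weight on
	echo WA_STATIC_WEIGHT > /sys/kernel/debug/sched_features

	# turn it back off (the default)
	echo NO_WA_STATIC_WEIGHT > /sys/kernel/debug/sched_features

	# list current sched features; disabled ones appear with a NO_ prefix
	cat /sys/kernel/debug/sched_features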