From 9b44ae617aa6d599becef9387af03579dc508744 Mon Sep 17 00:00:00 2001
From: Huaixin Chang
Date: Mon, 23 Dec 2019 19:40:35 +0800
Subject: [PATCH] sched/fair: use static load in wake_affine_weight

For a long time, runnable cpu load has been used to select the task rq
when waking up tasks. Recent tests have shown that, for workloads with
a large number of short-running tasks and nearly full cpu utilization,
static load is more helpful. In our e2e tests, the runnable load avg of
Java threads ranges from less than 10 to as large as 362, even though
these threads are no different from each other and should be treated in
the same way. After switching to static load, QPS improvements have
been seen in multiple test cases.

A new sched feature WA_STATIC_WEIGHT is introduced here to control this
behavior. Echo WA_STATIC_WEIGHT into /sys/kernel/debug/sched_features
to turn static load in wake_affine_weight on, and NO_WA_STATIC_WEIGHT
to turn it off. The feature is kept off by default.

Tests were done on the following hardware:

	4 threads Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz

In tests with 120 threads and SQL loglevel configured to info:

	NO_WA_STATIC_WEIGHT	WA_STATIC_WEIGHT
	33170.63		34614.95 (+4.35%)

In tests with 160 threads and SQL loglevel configured to info:

	NO_WA_STATIC_WEIGHT	WA_STATIC_WEIGHT
	35888.71		38247.20 (+6.57%)

In tests with 160 threads and SQL loglevel configured to warn:

	NO_WA_STATIC_WEIGHT	WA_STATIC_WEIGHT
	39118.72		39698.72 (+1.48%)

Signed-off-by: Huaixin Chang
Acked-by: Shanpei Chen
---
 kernel/sched/fair.c     | 72 ++++++++++++++++++++++++++++++++++++++---
 kernel/sched/features.h |  1 +
 2 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 784f6b1b9bee..1ffcbaccfde5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -692,6 +692,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
+static unsigned long task_h_load_static(struct task_struct *p);
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -5645,10 +5646,19 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 	s64 this_eff_load, prev_eff_load;
 	unsigned long task_load;
 
-	this_eff_load = target_load(this_cpu, sd->wake_idx);
+	if (sched_feat(WA_STATIC_WEIGHT))
+		this_eff_load =
+			scale_load_down(cpu_rq(this_cpu)->cfs.load.weight);
+	else
+		this_eff_load = target_load(this_cpu, sd->wake_idx);
 
 	if (sync) {
-		unsigned long current_load = task_h_load(current);
+		unsigned long current_load;
+
+		if (sched_feat(WA_STATIC_WEIGHT))
+			current_load = task_h_load_static(current);
+		else
+			current_load = task_h_load(current);
 
 		if (current_load > this_eff_load)
 			return this_cpu;
@@ -5656,14 +5666,21 @@ wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
 		this_eff_load -= current_load;
 	}
 
-	task_load = task_h_load(p);
+	if (sched_feat(WA_STATIC_WEIGHT))
+		task_load = task_h_load_static(p);
+	else
+		task_load = task_h_load(p);
 
 	this_eff_load += task_load;
 	if (sched_feat(WA_BIAS))
 		this_eff_load *= 100;
 	this_eff_load *= capacity_of(prev_cpu);
 
-	prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+	if (sched_feat(WA_STATIC_WEIGHT))
+		prev_eff_load =
+			scale_load_down(cpu_rq(prev_cpu)->cfs.load.weight);
+	else
+		prev_eff_load = source_load(prev_cpu, sd->wake_idx);
 	prev_eff_load -= task_load;
 	if (sched_feat(WA_BIAS))
 		prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
@@ -7498,6 +7515,48 @@ static unsigned long task_h_load(struct task_struct *p)
 	return div64_ul(p->se.avg.load_avg * cfs_rq->h_load,
 			cfs_rq_load_avg(cfs_rq) + 1);
 }
+
+static void update_cfs_rq_h_load_static(struct cfs_rq *cfs_rq)
+{
+	struct rq *rq = rq_of(cfs_rq);
+	struct sched_entity *se = cfs_rq->tg->se[cpu_of(rq)];
+	unsigned long now = jiffies;
+	unsigned long load;
+
+	if (cfs_rq->last_h_load_update == now)
+		return;
+
+	WRITE_ONCE(cfs_rq->h_load_next, NULL);
+	for_each_sched_entity(se) {
+		cfs_rq = cfs_rq_of(se);
+		WRITE_ONCE(cfs_rq->h_load_next, se);
+		if (cfs_rq->last_h_load_update == now)
+			break;
+	}
+
+	if (!se) {
+		cfs_rq->h_load = scale_load_down(cfs_rq->load.weight);
+		cfs_rq->last_h_load_update = now;
+	}
+
+	while ((se = READ_ONCE(cfs_rq->h_load_next)) != NULL) {
+		load = cfs_rq->h_load;
+		load = div64_ul(load * se->load.weight,
+				cfs_rq->load.weight + 1);
+		cfs_rq = group_cfs_rq(se);
+		cfs_rq->h_load = load;
+		cfs_rq->last_h_load_update = now;
+	}
+}
+
+static unsigned long task_h_load_static(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+
+	update_cfs_rq_h_load_static(cfs_rq);
+	return div64_ul(p->se.load.weight * cfs_rq->h_load,
+			cfs_rq->load.weight + 1);
+}
 #else
 static inline void update_blocked_averages(int cpu)
 {
@@ -7526,6 +7585,11 @@ static unsigned long task_h_load(struct task_struct *p)
 {
 	return p->se.avg.load_avg;
 }
+
+static unsigned long task_h_load_static(struct task_struct *p)
+{
+	return scale_load_down(p->se.load.weight);
+}
 #endif
 
 /********** Helpers for find_busiest_group ************************/
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 85ae8488039c..6de8eff23e3b 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -85,6 +85,7 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+SCHED_FEAT(WA_STATIC_WEIGHT, false)
 
 /*
  * UtilEstimation. Use estimated CPU utilization.
-- 
GitLab
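
As a usage sketch only (assuming debugfs is mounted at /sys/kernel/debug,
a kernel built with CONFIG_SCHED_DEBUG, and root privileges), the knob
described in the changelog can be toggled and inspected at runtime:

	# turn static load in wake_affine_weight on
	echo WA_STATIC_WEIGHT > /sys/kernel/debug/sched_features

	# turn it back off (the default)
	echo NO_WA_STATIC_WEIGHT > /sys/kernel/debug/sched_features

	# list current sched features; disabled ones appear with a NO_ prefix
	cat /sys/kernel/debug/sched_features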