From 70a23044f6b4778e770ab7e420abfe6e34d018ad Mon Sep 17 00:00:00 2001 From: Huaixin Chang Date: Tue, 24 Mar 2020 11:34:53 +0800 Subject: [PATCH] sched/fair: Fix race between runtime distribution and assignment fix #25892693 commit 26a8b12747c975b33b4a82d62e4a307e1c07f31b upstream Currently, there is a potential race between distribute_cfs_runtime() and assign_cfs_rq_runtime(). Race happens when cfs_b->runtime is read, distributes without holding lock and finds out there is not enough runtime to charge against after distribution. Because assign_cfs_rq_runtime() might be called during distribution, and use cfs_b->runtime at the same time. Fibtest is the tool to test this race. Assume all gcfs_rq is throttled and cfs period timer runs, slow threads might run and sleep, returning unused cfs_rq runtime and keeping min_cfs_rq_runtime in their local pool. If all this happens sufficiently quickly, cfs_b->runtime will drop a lot. If runtime distributed is large too, over-use of runtime happens. A runtime over-using by about 70 percent of quota is seen when we test fibtest on a 96-core machine. We run fibtest with 1 fast thread and 95 slow threads in test group, configure 10ms quota for this group and see the CPU usage of fibtest is 17.0%, which is far from than the expected 10%. On a smaller machine with 32 cores, we also run fibtest with 96 threads. CPU usage is more than 12%, which is also more than expected 10%. This shows that on similar workloads, this race do affect CPU bandwidth control. Solve this by holding lock inside distribute_cfs_runtime(). Fixes: c06f04c70489 ("sched: Fix potential near-infinite distribute_cfs_runtime() loop") Signed-off-by: Huaixin Chang Reviewed-by: Ben Segall Reviewed-by: Xunlei Pang Link: https://lore.kernel.org/lkml/20200325092602.22471-1-changhuaixin@linux.alibaba.com/ Signed-off-by: Yihao Wu --- kernel/sched/fair.c | 31 +++++++++++-------------------- 1 file changed, 11 insertions(+), 20 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b4dd16e6907..6e12d6ad3481 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4549,11 +4549,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) resched_curr(rq); } -static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) +static void distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { struct cfs_rq *cfs_rq; - u64 runtime; - u64 starting_runtime = remaining; + u64 runtime, remaining = 1; rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -4568,10 +4567,13 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) /* By the above check, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); + raw_spin_lock(&cfs_b->lock); runtime = -cfs_rq->runtime_remaining + 1; - if (runtime > remaining) - runtime = remaining; - remaining -= runtime; + if (runtime > cfs_b->runtime) + runtime = cfs_b->runtime; + cfs_b->runtime -= runtime; + remaining = cfs_b->runtime; + raw_spin_unlock(&cfs_b->lock); cfs_rq->runtime_remaining += runtime; @@ -4586,8 +4588,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) break; } rcu_read_unlock(); - - return starting_runtime - remaining; } /* @@ -4598,7 +4598,6 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, u64 remaining) */ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) { - u64 runtime; int throttled; /* no need to continue the timer with no bandwidth constraint */ @@ -4627,24 +4626,17 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) cfs_b->nr_throttled += overrun; /* - * This check is repeated as we are holding onto the new bandwidth while - * we unthrottle. This can potentially race with an unthrottled group - * trying to acquire new bandwidth from the global pool. This can result - * in us over-using our runtime if it is all used during this loop, but - * only by limited amounts in that extreme case. + * This check is repeated as we release cfs_b->lock while we unthrottle. */ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { - runtime = cfs_b->runtime; cfs_b->distribute_running = 1; raw_spin_unlock(&cfs_b->lock); /* we can't nest cfs_b->lock while distributing bandwidth */ - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock(&cfs_b->lock); cfs_b->distribute_running = 0; throttled = !list_empty(&cfs_b->throttled_cfs_rq); - - cfs_b->runtime -= min(runtime, cfs_b->runtime); } /* @@ -4777,10 +4769,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (!runtime) return; - runtime = distribute_cfs_runtime(cfs_b, runtime); + distribute_cfs_runtime(cfs_b); raw_spin_lock(&cfs_b->lock); - cfs_b->runtime -= min(runtime, cfs_b->runtime); cfs_b->distribute_running = 0; raw_spin_unlock(&cfs_b->lock); } -- GitLab