loadavg.c 12.8 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0
2
/*
3
 * kernel/sched/loadavg.c
4
 *
5 6 7
 * This file contains the magic bits required to compute the global loadavg
 * figure. Its a silly number but people think its important. We go through
 * great pains to make it work on big machines and tickless kernels.
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 */
#include "sched.h"

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *   nr_active = 0;
 *   for_each_possible_cpu(cpu)
 *	nr_active += cpu_of(cpu)->nr_running + cpu_of(cpu)->nr_uninterruptible;
 *
 *   avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns in the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with
31
 *    serious number of CPUs, therefore we need to take a distributed approach
32 33 34 35 36 37
 *    to calculating nr_active.
 *
 *        \Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *                      = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So assuming nr_active := 0 when we start out -- true per definition, we
38
 *    can simply take per-CPU deltas and fold those into a global accumulate
39 40
 *    to obtain the same result. See calc_load_fold_active().
 *
41
 *    Furthermore, in order to avoid synchronizing all per-CPU delta folding
42
 *    across the machine, we assume 10 ticks is sufficient time for every
43
 *    CPU to have completed this task.
44 45 46 47
 *
 *    This places an upper-bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't loose the delta, just wrecks the sample.
 *
48 49 50 51
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
 *    this would add another cross-CPU cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever CPU the task ran
 *    when it went into uninterruptible state and decrement on whatever CPU
52
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
53
 *    all CPUs yields the correct result.
54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78
 *
 *  This covers the NO_HZ=n code, for extra head-aches, see the comment below.
 */

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}

79
long calc_load_fold_active(struct rq *this_rq, long adjust)
80 81 82
{
	long nr_active, delta = 0;

83
	nr_active = this_rq->nr_running - adjust;
84
	nr_active += (long)this_rq->nr_uninterruptible;
85 86 87 88 89 90 91 92 93

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}

94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself for n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return result;
}

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 *  ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^n-1) [1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0          1 - x
 */
unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}

163 164 165 166 167
#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
168
 * load-average relies on per-CPU sampling from the tick, it is affected by
169 170
 * NO_HZ.
 *
171
 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
172
 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
173 174 175 176 177 178 179
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
180
 *    We avoid this by keeping two NO_HZ-delta counters and flipping them
181 182 183 184 185 186 187 188 189 190
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0           0 1           1 0
 *    w:0 1 1           0 0           1 1           0 0
 *
191
 *    This ensures we'll fold the old NO_HZ contribution in this window while
192 193
 *    accumlating the new one.
 *
194
 *  - When we wake up from NO_HZ during the window, we push up our
195 196 197 198
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
199
 *    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
200
 *    was in effect at the time the window opened). This also solves the issue
201
 *    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
202
 *    intervals.
203 204 205
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
206
static atomic_long_t calc_load_nohz[2];
207 208 209 210 211 212 213 214 215 216 217 218 219 220
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
221
	 * next NO_HZ-delta.
222
	 */
223
	if (!time_before(jiffies, READ_ONCE(calc_load_update)))
224 225 226 227 228 229 230 231 232 233
		idx++;

	return idx & 1;
}

static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

234
void calc_load_nohz_start(void)
235 236 237 238 239
{
	struct rq *this_rq = this_rq();
	long delta;

	/*
240 241
	 * We're going into NO_HZ mode, if there's any pending delta, fold it
	 * into the pending NO_HZ delta.
242
	 */
243
	delta = calc_load_fold_active(this_rq, 0);
244 245
	if (delta) {
		int idx = calc_load_write_idx();
246

247
		atomic_long_add(delta, &calc_load_nohz[idx]);
248 249 250
	}
}

251
void calc_load_nohz_stop(void)
252 253 254 255
{
	struct rq *this_rq = this_rq();

	/*
256
	 * If we're still before the pending sample window, we're done.
257
	 */
258
	this_rq->calc_load_update = READ_ONCE(calc_load_update);
259 260 261 262 263 264 265 266 267 268 269 270
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

271
static long calc_load_nohz_fold(void)
272 273 274 275
{
	int idx = calc_load_read_idx();
	long delta = 0;

276 277
	if (atomic_long_read(&calc_load_nohz[idx]))
		delta = atomic_long_xchg(&calc_load_nohz[idx], 0);
278 279 280 281 282

	return delta;
}

/*
283
 * NO_HZ can leave us missing all per-CPU ticks calling
284 285 286
 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
287 288 289 290 291 292
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
293
	unsigned long sample_window;
294 295
	long delta, active, n;

296 297
	sample_window = READ_ONCE(calc_load_update);
	if (!time_before(jiffies, sample_window + 10)) {
298 299 300
		/*
		 * Catch-up, fold however many we are behind still
		 */
301
		delta = jiffies - sample_window - 10;
302 303 304 305 306 307 308 309 310
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

311
		WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
312 313 314
	}

	/*
315
	 * Flip the NO_HZ index...
316 317 318 319 320 321 322 323 324 325
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index, this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}
#else /* !CONFIG_NO_HZ_COMMON */

326
static inline long calc_load_nohz_fold(void) { return 0; }
327 328 329 330 331 332 333
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
334 335
 *
 * Called from the global timer code.
336 337 338
 */
void calc_global_load(unsigned long ticks)
{
339
	unsigned long sample_window;
340 341
	long active, delta;

342 343
	sample_window = READ_ONCE(calc_load_update);
	if (time_before(jiffies, sample_window + 10))
344 345 346
		return;

	/*
347
	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
348
	 */
349
	delta = calc_load_nohz_fold();
350 351 352 353 354 355 356 357 358 359
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

360
	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);
361 362

	/*
363 364
	 * In case we went to NO_HZ for multiple LOAD_FREQ intervals
	 * catch up in bulk.
365 366 367 368 369
	 */
	calc_global_nohz();
}

/*
370
 * Called from scheduler_tick() to periodically update this CPU's
371 372
 * active count.
 */
373
void calc_global_load_tick(struct rq *this_rq)
374 375 376 377 378 379
{
	long delta;

	if (time_before(jiffies, this_rq->calc_load_update))
		return;

380
	delta  = calc_load_fold_active(this_rq, 0);
381 382 383 384 385
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}
386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450

static void calc_stress_avgs_work(struct work_struct *work);
#define STRESS_FREQ (5*HZ+1)
u64 stress_avg_total[3];
static u64 stress_avg_next;
static u64 stress_avg_last;
static u64 stress_period __read_mostly;
DECLARE_DELAYED_WORK(stress_delayed_work, calc_stress_avgs_work);

static void calc_avgs(u64 avg[3], int missed_periods, u64 stress, u64 period)
{
	unsigned long pct;

	/* Fill in zeroes for periods of no activity */
	if (missed_periods) {
		avg[0] = calc_load_n(avg[0], EXP_1, 0, missed_periods);
		avg[1] = calc_load_n(avg[1], EXP_5, 0, missed_periods);
		avg[2] = calc_load_n(avg[2], EXP_15, 0, missed_periods);
	}

	/* Sample the most recent active period */
	if (period == 0)
		period = 1;
	pct = div64_u64(stress, period);
	pct *= FIXED_1;
	avg[0] = calc_load(avg[0], EXP_1, pct);
	avg[1] = calc_load(avg[1], EXP_5, pct);
	avg[2] = calc_load(avg[2], EXP_15, pct);
}

static void calc_stress_avgs_work(struct work_struct *work)
{
	int cpu;
	struct rq *rq;
	unsigned long delay_ticks;
	u64 now, stress, period, missed_periods = 0, stress_total = 0;

	now = sched_clock();

	if (now - stress_avg_next >= stress_period)
		missed_periods = div64_u64(now - stress_avg_next,
						stress_period);

	period = now - (stress_avg_last + (missed_periods * stress_period));
	stress_avg_last = now;

	for_each_possible_cpu(cpu) {
		rq = cpu_rq(cpu);

		stress = atomic64_xchg(&rq->cpu_stress, 0);
		stress_total += stress;
	}

	calc_avgs(stress_avg_total, missed_periods, stress_total, period);

	stress_avg_next += (1 + missed_periods) * stress_period;
	delay_ticks = nsecs_to_jiffies(stress_avg_next - now) + 1;
	schedule_delayed_work(&stress_delayed_work, delay_ticks);
}

void schedule_stress(void)
{
	stress_period = jiffies_to_nsecs(STRESS_FREQ);
	schedule_delayed_work(&stress_delayed_work, STRESS_FREQ);
}