// SPDX-License-Identifier: GPL-2.0-only
/* Copyright (C) 2013 Cisco Systems, Inc, 2013.
 *
 * Author: Vijay Subramanian <vijaynsu@cisco.com>
 * Author: Mythili Prabhu <mysuryan@cisco.com>
 *
 * ECN support is added by Naeem Khademi <naeemk@ifi.uio.no>
 * University of Oslo, Norway.
 *
 * References:
 * RFC 8033: https://tools.ietf.org/html/rfc8033
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/pkt_sched.h>
#include <net/inet_ecn.h>
#include <net/pie.h>

/* private data for the Qdisc */
struct pie_sched_data {
	struct pie_vars vars;
	struct pie_params params;
	struct pie_stats stats;
	struct timer_list adapt_timer;
	struct Qdisc *sch;
};

bool pie_drop_early(struct Qdisc *sch, struct pie_params *params,
		    struct pie_vars *vars, u32 backlog, u32 packet_size)
{
	u64 rnd;
	u64 local_prob = vars->prob;
	u32 mtu = psched_mtu(qdisc_dev(sch));

	/* If there is still burst allowance left, skip random early drop */
	if (vars->burst_time > 0)
		return false;

	/* If the current delay is less than half of the target, and
	 * the drop probability is already low, disable early drop.
	 */
	if ((vars->qdelay < params->target / 2) &&
	    (vars->prob < MAX_PROB / 5))
		return false;

	/* If we have fewer than 2 mtu-sized packets, disable pie_drop_early,
	 * similar to min_th in RED
	 */
	if (backlog < 2 * mtu)
		return false;

	/* If bytemode is turned on, use packet size to compute the new
	 * probability. Smaller packets will have a lower drop probability in
	 * this case.
	 */
	if (params->bytemode && packet_size <= mtu)
		local_prob = (u64)packet_size * div_u64(local_prob, mtu);
	else
		local_prob = vars->prob;

	if (local_prob == 0) {
		vars->accu_prob = 0;
		vars->accu_prob_overflows = 0;
	}

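	/* De-randomization, as described in RFC 8033: accu_prob accumulates
	 * the drop probability over consecutive packets. No random drop is
	 * taken while the accumulated value is still below 0.85, and a drop
	 * is forced once it reaches roughly 8.5 (tracked as 8 overflows of
	 * MAX_PROB plus MAX_PROB / 2), keeping drops from clustering or
	 * spreading out too much.
	 */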
	if (local_prob > MAX_PROB - vars->accu_prob)
		vars->accu_prob_overflows++;

	vars->accu_prob += local_prob;

	if (vars->accu_prob_overflows == 0 &&
	    vars->accu_prob < (MAX_PROB / 100) * 85)
		return false;
	if (vars->accu_prob_overflows == 8 &&
	    vars->accu_prob >= MAX_PROB / 2)
		return true;

	prandom_bytes(&rnd, 8);
	if (rnd < local_prob) {
		vars->accu_prob = 0;
		vars->accu_prob_overflows = 0;
		return true;
	}

	return false;
}
EXPORT_SYMBOL_GPL(pie_drop_early);

static int pie_qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			     struct sk_buff **to_free)
{
	struct pie_sched_data *q = qdisc_priv(sch);
	bool enqueue = false;

	if (unlikely(qdisc_qlen(sch) >= sch->limit)) {
		q->stats.overlimit++;
		goto out;
	}

	if (!pie_drop_early(sch, &q->params, &q->vars, sch->qstats.backlog,
			    skb->len)) {
		enqueue = true;
	} else if (q->params.ecn && (q->vars.prob <= MAX_PROB / 10) &&
		   INET_ECN_set_ce(skb)) {
		/* If the packet is ECN capable, mark it when the drop
		 * probability is lower than 10%; otherwise drop it.
		 */
		q->stats.ecn_mark++;
		enqueue = true;
	}

	/* we can enqueue the packet */
	if (enqueue) {
		/* Set enqueue time only when dq_rate_estimator is disabled. */
		if (!q->params.dq_rate_estimator)
			pie_set_enqueue_time(skb);

		q->stats.packets_in++;
		if (qdisc_qlen(sch) > q->stats.maxq)
			q->stats.maxq = qdisc_qlen(sch);

		return qdisc_enqueue_tail(skb, sch);
	}

out:
	q->stats.dropped++;
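	/* Any packet dropped here also restarts the de-randomization
	 * accumulator, so the next random-drop decision starts fresh.
	 */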
	q->vars.accu_prob = 0;
	q->vars.accu_prob_overflows = 0;
	return qdisc_drop(skb, sch, to_free);
}

static const struct nla_policy pie_policy[TCA_PIE_MAX + 1] = {
	[TCA_PIE_TARGET]		= {.type = NLA_U32},
	[TCA_PIE_LIMIT]			= {.type = NLA_U32},
	[TCA_PIE_TUPDATE]		= {.type = NLA_U32},
	[TCA_PIE_ALPHA]			= {.type = NLA_U32},
	[TCA_PIE_BETA]			= {.type = NLA_U32},
	[TCA_PIE_ECN]			= {.type = NLA_U32},
	[TCA_PIE_BYTEMODE]		= {.type = NLA_U32},
	[TCA_PIE_DQ_RATE_ESTIMATOR]	= {.type = NLA_U32},
};

static int pie_change(struct Qdisc *sch, struct nlattr *opt,
		      struct netlink_ext_ack *extack)
{
	struct pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_PIE_MAX + 1];
	unsigned int qlen, dropped = 0;
	int err;

	if (!opt)
		return -EINVAL;

	err = nla_parse_nested_deprecated(tb, TCA_PIE_MAX, opt, pie_policy,
					  NULL);
	if (err < 0)
		return err;

	sch_tree_lock(sch);

	/* convert from microseconds to pschedtime */
	if (tb[TCA_PIE_TARGET]) {
		/* target is in us */
		u32 target = nla_get_u32(tb[TCA_PIE_TARGET]);

		/* convert to pschedtime */
		q->params.target = PSCHED_NS2TICKS((u64)target * NSEC_PER_USEC);
	}

	/* tupdate is in jiffies */
	if (tb[TCA_PIE_TUPDATE])
		q->params.tupdate =
			usecs_to_jiffies(nla_get_u32(tb[TCA_PIE_TUPDATE]));

	if (tb[TCA_PIE_LIMIT]) {
		u32 limit = nla_get_u32(tb[TCA_PIE_LIMIT]);

		q->params.limit = limit;
		sch->limit = limit;
	}

	if (tb[TCA_PIE_ALPHA])
		q->params.alpha = nla_get_u32(tb[TCA_PIE_ALPHA]);

	if (tb[TCA_PIE_BETA])
		q->params.beta = nla_get_u32(tb[TCA_PIE_BETA]);

	if (tb[TCA_PIE_ECN])
		q->params.ecn = nla_get_u32(tb[TCA_PIE_ECN]);

	if (tb[TCA_PIE_BYTEMODE])
		q->params.bytemode = nla_get_u32(tb[TCA_PIE_BYTEMODE]);

	if (tb[TCA_PIE_DQ_RATE_ESTIMATOR])
		q->params.dq_rate_estimator =
				nla_get_u32(tb[TCA_PIE_DQ_RATE_ESTIMATOR]);

	/* Drop excess packets if new limit is lower */
	qlen = sch->q.qlen;
	while (sch->q.qlen > sch->limit) {
		struct sk_buff *skb = __qdisc_dequeue_head(&sch->q);

		dropped += qdisc_pkt_len(skb);
		qdisc_qstats_backlog_dec(sch, skb);
		rtnl_qdisc_drop(skb, sch);
	}
	qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);

	sch_tree_unlock(sch);
	return 0;
}

void pie_process_dequeue(struct sk_buff *skb, struct pie_params *params,
			 struct pie_vars *vars, u32 backlog)
{
	psched_time_t now = psched_get_time();
	u32 dtime = 0;

	/* If dq_rate_estimator is disabled, calculate qdelay using the
	 * packet timestamp.
	 */
	if (!params->dq_rate_estimator) {
		vars->qdelay = now - pie_get_enqueue_time(skb);

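		/* dtime is the time since the previous dequeue; it is used
		 * below to drain the burst allowance even though no
		 * drain-rate estimate is kept in this mode.
		 */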
		if (vars->dq_tstamp != DTIME_INVALID)
			dtime = now - vars->dq_tstamp;

		vars->dq_tstamp = now;

		if (backlog == 0)
			vars->qdelay = 0;

		if (dtime == 0)
			return;

		goto burst_allowance_reduction;
	}

	/* If the current queue is about 10 packets or more and dq_count is
	 * unset, we have enough packets to calculate the drain rate. Save
	 * the current time as dq_tstamp and start a measurement cycle.
	 */
	if (backlog >= QUEUE_THRESHOLD && vars->dq_count == DQCOUNT_INVALID) {
		vars->dq_tstamp = psched_get_time();
		vars->dq_count = 0;
	}

	/* Calculate the average drain rate from this value. If queue length
	 * has receded to a small value viz., <= QUEUE_THRESHOLD bytes, reset
	 * the dq_count to -1 as we don't have enough packets to calculate the
	 * drain rate anymore. The following if block is entered only when we
	 * have a substantial queue built up (QUEUE_THRESHOLD bytes or more)
	 * and we calculate the drain rate for the threshold here.  dq_count is
	 * in bytes, time difference in psched_time, hence rate is in
	 * bytes/psched_time.
	 */
	if (vars->dq_count != DQCOUNT_INVALID) {
		vars->dq_count += skb->len;

		if (vars->dq_count >= QUEUE_THRESHOLD) {
			u32 count = vars->dq_count << PIE_SCALE;

			dtime = now - vars->dq_tstamp;

			if (dtime == 0)
				return;

			count = count / dtime;

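			/* Exponentially weighted moving average with a weight
			 * of 1/8: avg_dq_rate = 7/8 * avg_dq_rate + 1/8 * count.
			 */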
			if (vars->avg_dq_rate == 0)
				vars->avg_dq_rate = count;
			else
				vars->avg_dq_rate =
				    (vars->avg_dq_rate -
				     (vars->avg_dq_rate >> 3)) + (count >> 3);

			/* If the queue has receded below the threshold, we hold
			 * on to the last drain rate calculated, else we reset
			 * dq_count to 0 to re-enter the if block when the next
			 * packet is dequeued
			 */
			if (backlog < QUEUE_THRESHOLD) {
				vars->dq_count = DQCOUNT_INVALID;
			} else {
				vars->dq_count = 0;
				vars->dq_tstamp = psched_get_time();
			}

			goto burst_allowance_reduction;
		}
	}

	return;

burst_allowance_reduction:
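	/* Consume the initial burst allowance by the time elapsed since the
	 * previous dequeue; pie_drop_early() skips random drops for as long
	 * as burst_time is non-zero.
	 */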
	if (vars->burst_time > 0) {
		if (vars->burst_time > dtime)
			vars->burst_time -= dtime;
		else
			vars->burst_time = 0;
	}
}
EXPORT_SYMBOL_GPL(pie_process_dequeue);

void pie_calculate_probability(struct pie_params *params, struct pie_vars *vars,
			       u32 backlog)
{
	psched_time_t qdelay = 0;	/* in pschedtime */
	psched_time_t qdelay_old = 0;	/* in pschedtime */
	s64 delta = 0;		/* determines the change in probability */
	u64 oldprob;
	u64 alpha, beta;
	u32 power;
	bool update_prob = true;

	if (params->dq_rate_estimator) {
		qdelay_old = vars->qdelay;
		vars->qdelay_old = vars->qdelay;

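		/* Estimate queuing delay from the backlog and the average
		 * drain rate (Little's law): qdelay = backlog / avg_dq_rate.
		 * The PIE_SCALE shift cancels the scaling applied when
		 * avg_dq_rate was computed.
		 */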
		if (vars->avg_dq_rate > 0)
			qdelay = (backlog << PIE_SCALE) / vars->avg_dq_rate;
		else
			qdelay = 0;
	} else {
		qdelay = vars->qdelay;
		qdelay_old = vars->qdelay_old;
	}

	/* If qdelay is zero and backlog is not, it means backlog is very small,
	 * so we do not update probability in this round.
	 */
	if (qdelay == 0 && backlog != 0)
		update_prob = false;

	/* In the algorithm, alpha and beta are between 0 and 2 with typical
	 * value for alpha as 0.125. In this implementation, we use values 0-32
	 * passed from user space to represent this. Also, alpha and beta have
	 * unit of HZ and need to be scaled before they can be used to update
	 * probability. alpha/beta are updated locally below by scaling down
	 * by 16 to come to the 0-2 range.
	 */
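	/* For example, a user-space alpha of 2 maps to the typical
	 * 2/16 = 0.125 mentioned above, and a beta of 20 maps to 20/16 = 1.25.
	 */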
	alpha = ((u64)params->alpha * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;
	beta = ((u64)params->beta * (MAX_PROB / PSCHED_TICKS_PER_SEC)) >> 4;

	/* We scale alpha and beta differently depending on how heavy the
	 * congestion is. Please see RFC 8033 for details.
	 */
	if (vars->prob < MAX_PROB / 10) {
		alpha >>= 1;
		beta >>= 1;

		power = 100;
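		/* For each further decade the probability lies below 1%
		 * (checked down to MAX_PROB / 1000000), scale alpha and beta
		 * down by an additional factor of 4.
		 */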
		while (vars->prob < div_u64(MAX_PROB, power) &&
		       power <= 1000000) {
			alpha >>= 2;
			beta >>= 2;
			power *= 10;
		}
	}

	/* alpha and beta should be between 0 and 32, in multiples of 1/16 */
	delta += alpha * (u64)(qdelay - params->target);
	delta += beta * (u64)(qdelay - qdelay_old);

	oldprob = vars->prob;

	/* to ensure we increase probability in steps of no more than 2% */
	if (delta > (s64)(MAX_PROB / (100 / 2)) &&
	    vars->prob >= MAX_PROB / 10)
		delta = (MAX_PROB / 100) * 2;

	/* Non-linear drop:
	 * Tune drop probability to increase quickly for high delays (>= 250 ms).
	 * 250 ms is derived through experiments and provides error protection.
	 */

	if (qdelay > (PSCHED_NS2TICKS(250 * NSEC_PER_MSEC)))
		delta += MAX_PROB / (100 / 2);

	vars->prob += delta;

	if (delta > 0) {
		/* prevent overflow */
		if (vars->prob < oldprob) {
			vars->prob = MAX_PROB;
			/* Prevent normalization error. If probability is at
			 * maximum value already, we normalize it here, and
			 * skip the check to do a non-linear drop in the next
			 * section.
			 */
			update_prob = false;
		}
	} else {
		/* prevent underflow */
		if (vars->prob > oldprob)
			vars->prob = 0;
	}

	/* Non-linear drop in probability: Reduce drop probability quickly if
	 * delay is 0 for 2 consecutive Tupdate periods.
	 */

	if (qdelay == 0 && qdelay_old == 0 && update_prob)
		/* Reduce drop probability to 98.4% of its current value */
		vars->prob -= vars->prob / 64;

	vars->qdelay = qdelay;
	vars->backlog_old = backlog;

	/* We restart the measurement cycle if the following conditions are met
	 * 1. If the delay has been low for 2 consecutive Tupdate periods
	 * 2. Calculated drop probability is zero
	 * 3. If the dq_rate_estimator is enabled, we have at least one
	 *    estimate for avg_dq_rate, i.e., it is a non-zero value
	 */
	if ((vars->qdelay < params->target / 2) &&
	    (vars->qdelay_old < params->target / 2) &&
	    vars->prob == 0 &&
	    (!params->dq_rate_estimator || vars->avg_dq_rate > 0)) {
		pie_vars_init(vars);
	}

	if (!params->dq_rate_estimator)
		vars->qdelay_old = qdelay;
}
EXPORT_SYMBOL_GPL(pie_calculate_probability);

static void pie_timer(struct timer_list *t)
{
	struct pie_sched_data *q = from_timer(q, t, adapt_timer);
	struct Qdisc *sch = q->sch;
	spinlock_t *root_lock = qdisc_lock(qdisc_root_sleeping(sch));

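	/* The timer runs outside the qdisc datapath, so take the root qdisc
	 * lock before updating vars shared with enqueue/dequeue.
	 */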
	spin_lock(root_lock);
	pie_calculate_probability(&q->params, &q->vars, sch->qstats.backlog);

	/* reset the timer to fire after 'tupdate'. tupdate is in jiffies. */
	if (q->params.tupdate)
		mod_timer(&q->adapt_timer, jiffies + q->params.tupdate);
	spin_unlock(root_lock);
}

static int pie_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct pie_sched_data *q = qdisc_priv(sch);

	pie_params_init(&q->params);
	pie_vars_init(&q->vars);
	sch->limit = q->params.limit;

	q->sch = sch;
	timer_setup(&q->adapt_timer, pie_timer, 0);

	if (opt) {
		int err = pie_change(sch, opt, extack);

		if (err)
			return err;
	}

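	/* Arm the adaptation timer to fire for the first time in half a
	 * second; pie_timer() then re-arms it every tupdate.
	 */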
	mod_timer(&q->adapt_timer, jiffies + HZ / 2);
	return 0;
}

static int pie_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct pie_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts;

	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (!opts)
		goto nla_put_failure;

	/* convert target from pschedtime to us */
	if (nla_put_u32(skb, TCA_PIE_TARGET,
			((u32)PSCHED_TICKS2NS(q->params.target)) /
			NSEC_PER_USEC) ||
	    nla_put_u32(skb, TCA_PIE_LIMIT, sch->limit) ||
	    nla_put_u32(skb, TCA_PIE_TUPDATE,
			jiffies_to_usecs(q->params.tupdate)) ||
	    nla_put_u32(skb, TCA_PIE_ALPHA, q->params.alpha) ||
	    nla_put_u32(skb, TCA_PIE_BETA, q->params.beta) ||
	    nla_put_u32(skb, TCA_PIE_ECN, q->params.ecn) ||
	    nla_put_u32(skb, TCA_PIE_BYTEMODE, q->params.bytemode) ||
	    nla_put_u32(skb, TCA_PIE_DQ_RATE_ESTIMATOR,
			q->params.dq_rate_estimator))
		goto nla_put_failure;

	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -1;
}

static int pie_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct pie_sched_data *q = qdisc_priv(sch);
	struct tc_pie_xstats st = {
		.prob		= q->vars.prob,
		.delay		= ((u32)PSCHED_TICKS2NS(q->vars.qdelay)) /
				   NSEC_PER_USEC,
		.packets_in	= q->stats.packets_in,
		.overlimit	= q->stats.overlimit,
		.maxq		= q->stats.maxq,
		.dropped	= q->stats.dropped,
		.ecn_mark	= q->stats.ecn_mark,
	};

	/* avg_dq_rate is only valid if dq_rate_estimator is enabled */
	st.dq_rate_estimating = q->params.dq_rate_estimator;

	/* unscale and return dq_rate in bytes per sec */
	if (q->params.dq_rate_estimator)
		st.avg_dq_rate = q->vars.avg_dq_rate *
				 (PSCHED_TICKS_PER_SEC) >> PIE_SCALE;

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static struct sk_buff *pie_qdisc_dequeue(struct Qdisc *sch)
{
	struct pie_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = qdisc_dequeue_head(sch);

	if (!skb)
		return NULL;

	pie_process_dequeue(skb, &q->params, &q->vars, sch->qstats.backlog);
	return skb;
}

static void pie_reset(struct Qdisc *sch)
{
	struct pie_sched_data *q = qdisc_priv(sch);

	qdisc_reset_queue(sch);
	pie_vars_init(&q->vars);
}

static void pie_destroy(struct Qdisc *sch)
{
	struct pie_sched_data *q = qdisc_priv(sch);

	q->params.tupdate = 0;
	del_timer_sync(&q->adapt_timer);
}

static struct Qdisc_ops pie_qdisc_ops __read_mostly = {
	.id		= "pie",
	.priv_size	= sizeof(struct pie_sched_data),
	.enqueue	= pie_qdisc_enqueue,
	.dequeue	= pie_qdisc_dequeue,
	.peek		= qdisc_peek_dequeued,
	.init		= pie_init,
	.destroy	= pie_destroy,
	.reset		= pie_reset,
	.change		= pie_change,
	.dump		= pie_dump,
	.dump_stats	= pie_dump_stats,
	.owner		= THIS_MODULE,
};

static int __init pie_module_init(void)
{
	return register_qdisc(&pie_qdisc_ops);
}

static void __exit pie_module_exit(void)
{
	unregister_qdisc(&pie_qdisc_ops);
}

module_init(pie_module_init);
module_exit(pie_module_exit);

MODULE_DESCRIPTION("Proportional Integral controller Enhanced (PIE) scheduler");
MODULE_AUTHOR("Vijay Subramanian");
MODULE_AUTHOR("Mythili Prabhu");
MODULE_LICENSE("GPL");