/*
 * net/sched/sch_choke.c	CHOKE scheduler
 *
 * Copyright (c) 2011 Stephen Hemminger <shemminger@vyatta.com>
 * Copyright (c) 2011 Eric Dumazet <eric.dumazet@gmail.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * version 2 as published by the Free Software Foundation.
 *
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/vmalloc.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>
#include <net/inet_ecn.h>
#include <net/red.h>
#include <net/flow_dissector.h>

/*
   CHOKe stateless AQM for fair bandwidth allocation
   =================================================

   CHOKe (CHOose and Keep for responsive flows, CHOose and Kill for
   unresponsive flows) is a variant of RED that penalizes misbehaving flows
   but maintains no flow state. The difference from RED is an additional
   step during enqueue: if the average queue size is over the low
   threshold (qmin), a packet is chosen at random from the queue. If the
   new packet and the chosen packet belong to the same flow, both are
   dropped. Unlike RED, CHOKe is not really a "classful" qdisc because it
   needs to access packets in the queue randomly. It has a minimal class
   interface to allow overriding the builtin flow classifier with
   filters.

   Source:
   R. Pan, B. Prabhakar, and K. Psounis, "CHOKe, A Stateless
   Active Queue Management Scheme for Approximating Fair Bandwidth Allocation",
   IEEE INFOCOM, 2000.

   A. Tang, J. Wang, S. Low, "Understanding CHOKe: Throughput and Spatial
   Characteristics", IEEE/ACM Transactions on Networking, 2004.

 */
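
/*
 * Example configuration (illustrative only; option names and units are
 * those of tc-choke(8) and may differ between iproute2 versions, and the
 * numbers below are not tuned values):
 *
 *	tc qdisc add dev eth0 root choke limit 1000 min 100 max 300 \
 *		avpkt 1000 burst 150 bandwidth 10mbit ecn
 *
 * min/max correspond to the qth_min/qth_max thresholds used below; with
 * "ecn" set, packets are marked instead of dropped while the average
 * queue size stays under the max threshold.
 */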

/* Upper bound on size of sk_buff table (packets) */
#define CHOKE_MAX_QUEUE	(128*1024 - 1)

struct choke_sched_data {
/* Parameters */
	u32		 limit;
	unsigned char	 flags;

	struct red_parms parms;

/* Variables */
	struct red_vars  vars;
	struct {
		u32	prob_drop;	/* Early probability drops */
		u32	prob_mark;	/* Early probability marks */
		u32	forced_drop;	/* Forced drops, qavg > max_thresh */
		u32	forced_mark;	/* Forced marks, qavg > max_thresh */
		u32	pdrop;          /* Drops due to queue limits */
		u32	other;          /* Drops due to drop() calls */
		u32	matched;	/* Drops due to flow match */
	} stats;

	unsigned int	 head;
	unsigned int	 tail;

	unsigned int	 tab_mask; /* size - 1 */

	struct sk_buff **tab;
};

/* number of elements in queue including holes */
static unsigned int choke_len(const struct choke_sched_data *q)
{
	return (q->tail - q->head) & q->tab_mask;
}
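
/*
 * Worked example (added for illustration): with tab_mask == 7 (an
 * 8-slot table), head == 6 and tail == 2, choke_len() returns
 * (2 - 6) & 7 == 4, i.e. slots 6, 7, 0 and 1 are in use (possibly as
 * holes).  Unsigned arithmetic makes the subtraction wrap correctly
 * across the ring boundary.
 */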

/* Is ECN parameter configured */
static int use_ecn(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_ECN;
}

/* Should packets over max just be dropped (versus marked) */
static int use_harddrop(const struct choke_sched_data *q)
{
	return q->flags & TC_RED_HARDDROP;
}

/* Move head pointer forward to skip over holes */
static void choke_zap_head_holes(struct choke_sched_data *q)
{
	do {
		q->head = (q->head + 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->head] == NULL);
}

/* Move tail pointer backwards to reuse holes */
static void choke_zap_tail_holes(struct choke_sched_data *q)
{
	do {
		q->tail = (q->tail - 1) & q->tab_mask;
		if (q->head == q->tail)
			break;
	} while (q->tab[q->tail] == NULL);
}
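
/*
 * Hole illustration (added for clarity): if the table holds [A, B, C]
 * and choke_drop_by_idx() below removes B, the table becomes
 * [A, NULL, C].  Holes are never compacted; the two zap helpers above
 * simply skip over them when the head or tail pointer reaches them, so
 * random deletion stays O(1).
 */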

/* Drop packet from queue array by creating a "hole" */
static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx,
			      struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb = q->tab[idx];

	q->tab[idx] = NULL;

	if (idx == q->head)
		choke_zap_head_holes(q);
	if (idx == q->tail)
		choke_zap_tail_holes(q);

	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
	qdisc_drop(skb, sch, to_free);
	--sch->q.qlen;
}

struct choke_skb_cb {
	u8			keys_valid;
	struct			flow_keys_digest keys;
};

static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb)
{
	qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb));
	return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data;
}
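
/*
 * Note (added for clarity): qdisc_skb_cb(skb)->data is the per-qdisc
 * scratch space inside skb->cb that a qdisc may use while it owns the
 * packet; qdisc_cb_private_validate() is a compile-time assertion that
 * struct choke_skb_cb fits within it.
 */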

/*
 * Compare the flows of two packets.
 * Returns true only if the source and destination addresses and ports
 * match; false for special cases such as differing protocols.
 */
static bool choke_match_flow(struct sk_buff *skb1,
			     struct sk_buff *skb2)
{
	struct flow_keys temp;

	if (skb1->protocol != skb2->protocol)
		return false;

	if (!choke_skb_cb(skb1)->keys_valid) {
		choke_skb_cb(skb1)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb1, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb1)->keys, &temp);
	}

	if (!choke_skb_cb(skb2)->keys_valid) {
		choke_skb_cb(skb2)->keys_valid = 1;
		skb_flow_dissect_flow_keys(skb2, &temp, 0);
		make_flow_keys_digest(&choke_skb_cb(skb2)->keys, &temp);
	}

	return !memcmp(&choke_skb_cb(skb1)->keys,
		       &choke_skb_cb(skb2)->keys,
		       sizeof(choke_skb_cb(skb1)->keys));
}
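
/*
 * Illustrative example (not part of the original source): two TCP
 * segments from 10.0.0.1:5000 to 10.0.0.2:80 dissect to identical
 * digests and match; a segment from a different source port yields a
 * different digest and does not.  Since the digest is cached in skb->cb
 * under keys_valid, each queued packet is dissected at most once no
 * matter how many comparisons it takes part in.
 */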

/*
 * Select a packet at random from the queue.
 * HACK: since the queue can have holes from previous deletions, retry
 *   several times to find a random skb, then give up and return the head.
 * Will return NULL if the queue is empty (q->head == q->tail).
 */
static struct sk_buff *choke_peek_random(const struct choke_sched_data *q,
					 unsigned int *pidx)
{
	struct sk_buff *skb;
	int retries = 3;

	do {
		*pidx = (q->head + prandom_u32_max(choke_len(q))) & q->tab_mask;
		skb = q->tab[*pidx];
		if (skb)
			return skb;
	} while (--retries > 0);

	return q->tab[*pidx = q->head];
}
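
/*
 * Example (added for illustration): prandom_u32_max(n) returns a value
 * uniform in [0, n).  With head == 3, tab_mask == 7 and choke_len() == 5,
 * each try picks an index in {3, 4, 5, 6, 7}; if all three tries land on
 * holes, the head slot (kept hole-free by choke_zap_head_holes()) is
 * returned instead.
 */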

/*
 * Compare new packet with random packet in queue
 * returns true if matched and sets *pidx
 */
static bool choke_match_random(const struct choke_sched_data *q,
			       struct sk_buff *nskb,
			       unsigned int *pidx)
{
	struct sk_buff *oskb;

	if (q->head == q->tail)
		return false;

	oskb = choke_peek_random(q, pidx);
	return choke_match_flow(oskb, nskb);
}

static int choke_enqueue(struct sk_buff *skb, struct Qdisc *sch,
			 struct sk_buff **to_free)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	const struct red_parms *p = &q->parms;

	choke_skb_cb(skb)->keys_valid = 0;
	/* Compute average queue usage (see RED) */
	q->vars.qavg = red_calc_qavg(p, &q->vars, sch->q.qlen);
	if (red_is_idling(&q->vars))
		red_end_of_idle_period(&q->vars);

	/* Is queue small? */
	if (q->vars.qavg <= p->qth_min)
		q->vars.qcount = -1;
	else {
		unsigned int idx;

		/* Draw a packet at random from queue and compare flow */
		if (choke_match_random(q, skb, &idx)) {
			q->stats.matched++;
			choke_drop_by_idx(sch, idx, to_free);
			goto congestion_drop;
		}

		/* Queue is large, always mark/drop */
		if (q->vars.qavg > p->qth_max) {
			q->vars.qcount = -1;

			qdisc_qstats_overlimit(sch);
			if (use_harddrop(q) || !use_ecn(q) ||
			    !INET_ECN_set_ce(skb)) {
				q->stats.forced_drop++;
				goto congestion_drop;
			}

			q->stats.forced_mark++;
		} else if (++q->vars.qcount) {
			if (red_mark_probability(p, &q->vars, q->vars.qavg)) {
				q->vars.qcount = 0;
				q->vars.qR = red_random(p);

				qdisc_qstats_overlimit(sch);
				if (!use_ecn(q) || !INET_ECN_set_ce(skb)) {
					q->stats.prob_drop++;
					goto congestion_drop;
				}

				q->stats.prob_mark++;
			}
		} else
			q->vars.qR = red_random(p);
	}

	/* Admit new packet */
	if (sch->q.qlen < q->limit) {
		q->tab[q->tail] = skb;
		q->tail = (q->tail + 1) & q->tab_mask;
		++sch->q.qlen;
		qdisc_qstats_backlog_inc(sch, skb);
		return NET_XMIT_SUCCESS;
	}

	q->stats.pdrop++;
	return qdisc_drop(skb, sch, to_free);

congestion_drop:
	qdisc_drop(skb, sch, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *choke_dequeue(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;

	if (q->head == q->tail) {
		if (!red_is_idling(&q->vars))
			red_start_of_idle_period(&q->vars);
		return NULL;
	}

	skb = q->tab[q->head];
	q->tab[q->head] = NULL;
	choke_zap_head_holes(q);
	--sch->q.qlen;
	qdisc_qstats_backlog_dec(sch, skb);
	qdisc_bstats_update(sch, skb);

	return skb;
}

static void choke_reset(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	while (q->head != q->tail) {
		struct sk_buff *skb = q->tab[q->head];

		q->head = (q->head + 1) & q->tab_mask;
		if (!skb)
			continue;
		rtnl_qdisc_drop(skb, sch);
	}

	sch->q.qlen = 0;
	sch->qstats.backlog = 0;
	memset(q->tab, 0, (q->tab_mask + 1) * sizeof(struct sk_buff *));
	q->head = q->tail = 0;
	red_restart(&q->vars);
}

static const struct nla_policy choke_policy[TCA_CHOKE_MAX + 1] = {
	[TCA_CHOKE_PARMS]	= { .len = sizeof(struct tc_red_qopt) },
	[TCA_CHOKE_STAB]	= { .len = RED_STAB_SIZE },
	[TCA_CHOKE_MAX_P]	= { .type = NLA_U32 },
};

static void choke_free(void *addr)
{
	kvfree(addr);
}

static int choke_change(struct Qdisc *sch, struct nlattr *opt)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CHOKE_MAX + 1];
	const struct tc_red_qopt *ctl;
	int err;
	struct sk_buff **old = NULL;
	unsigned int mask;
	u32 max_P;

	if (opt == NULL)
		return -EINVAL;

	err = nla_parse_nested(tb, TCA_CHOKE_MAX, opt, choke_policy);
	if (err < 0)
		return err;

	if (tb[TCA_CHOKE_PARMS] == NULL ||
	    tb[TCA_CHOKE_STAB] == NULL)
		return -EINVAL;

	max_P = tb[TCA_CHOKE_MAX_P] ? nla_get_u32(tb[TCA_CHOKE_MAX_P]) : 0;

	ctl = nla_data(tb[TCA_CHOKE_PARMS]);

	if (ctl->limit > CHOKE_MAX_QUEUE)
		return -EINVAL;

	mask = roundup_pow_of_two(ctl->limit + 1) - 1;
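	/* e.g. ctl->limit == 1000 gives mask == 1023, i.e. a 1024-slot table */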
	if (mask != q->tab_mask) {
		struct sk_buff **ntab;

		ntab = kcalloc(mask + 1, sizeof(struct sk_buff *),
			       GFP_KERNEL | __GFP_NOWARN);
		if (!ntab)
			ntab = vzalloc((mask + 1) * sizeof(struct sk_buff *));
		if (!ntab)
			return -ENOMEM;

		sch_tree_lock(sch);
		old = q->tab;
		if (old) {
			unsigned int oqlen = sch->q.qlen, tail = 0;
			unsigned int dropped = 0;

			while (q->head != q->tail) {
				struct sk_buff *skb = q->tab[q->head];

				q->head = (q->head + 1) & q->tab_mask;
				if (!skb)
					continue;
				if (tail < mask) {
					ntab[tail++] = skb;
					continue;
				}
				dropped += qdisc_pkt_len(skb);
				qdisc_qstats_backlog_dec(sch, skb);
				--sch->q.qlen;
				rtnl_qdisc_drop(skb, sch);
			}
			qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
			q->head = 0;
			q->tail = tail;
		}

		q->tab_mask = mask;
		q->tab = ntab;
	} else
		sch_tree_lock(sch);

	q->flags = ctl->flags;
	q->limit = ctl->limit;

	red_set_parms(&q->parms, ctl->qth_min, ctl->qth_max, ctl->Wlog,
		      ctl->Plog, ctl->Scell_log,
		      nla_data(tb[TCA_CHOKE_STAB]),
		      max_P);
	red_set_vars(&q->vars);

	if (q->head == q->tail)
		red_end_of_idle_period(&q->vars);

	sch_tree_unlock(sch);
	choke_free(old);
	return 0;
}

static int choke_init(struct Qdisc *sch, struct nlattr *opt)
{
	return choke_change(sch, opt);
}

static int choke_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct nlattr *opts = NULL;
	struct tc_red_qopt opt = {
		.limit		= q->limit,
		.flags		= q->flags,
		.qth_min	= q->parms.qth_min >> q->parms.Wlog,
		.qth_max	= q->parms.qth_max >> q->parms.Wlog,
		.Wlog		= q->parms.Wlog,
		.Plog		= q->parms.Plog,
		.Scell_log	= q->parms.Scell_log,
	};

	opts = nla_nest_start(skb, TCA_OPTIONS);
	if (opts == NULL)
		goto nla_put_failure;

	if (nla_put(skb, TCA_CHOKE_PARMS, sizeof(opt), &opt) ||
	    nla_put_u32(skb, TCA_CHOKE_MAX_P, q->parms.max_P))
		goto nla_put_failure;
	return nla_nest_end(skb, opts);

nla_put_failure:
	nla_nest_cancel(skb, opts);
	return -EMSGSIZE;
}

static int choke_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct choke_sched_data *q = qdisc_priv(sch);
	struct tc_choke_xstats st = {
		.early	= q->stats.prob_drop + q->stats.forced_drop,
		.marked	= q->stats.prob_mark + q->stats.forced_mark,
		.pdrop	= q->stats.pdrop,
		.other	= q->stats.other,
		.matched = q->stats.matched,
	};

	return gnet_stats_copy_app(d, &st, sizeof(st));
}

static void choke_destroy(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	choke_free(q->tab);
}

static struct sk_buff *choke_peek_head(struct Qdisc *sch)
{
	struct choke_sched_data *q = qdisc_priv(sch);

	return (q->head != q->tail) ? q->tab[q->head] : NULL;
}

static struct Qdisc_ops choke_qdisc_ops __read_mostly = {
	.id		=	"choke",
	.priv_size	=	sizeof(struct choke_sched_data),

	.enqueue	=	choke_enqueue,
	.dequeue	=	choke_dequeue,
	.peek		=	choke_peek_head,
	.init		=	choke_init,
	.destroy	=	choke_destroy,
	.reset		=	choke_reset,
	.change		=	choke_change,
	.dump		=	choke_dump,
	.dump_stats	=	choke_dump_stats,
	.owner		=	THIS_MODULE,
};

static int __init choke_module_init(void)
{
	return register_qdisc(&choke_qdisc_ops);
}

static void __exit choke_module_exit(void)
{
	unregister_qdisc(&choke_qdisc_ops);
}

module_init(choke_module_init)
module_exit(choke_module_exit)

MODULE_LICENSE("GPL");