sch_cbq.c 43.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 */

#include <linux/module.h>
14
#include <linux/slab.h>
L
Linus Torvalds 已提交
15 16 17 18 19
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
20
#include <net/netlink.h>
L
Linus Torvalds 已提交
21 22 23 24 25 26 27
#include <net/pkt_sched.h>


/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	Algorithm skeleton was taken from NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	he should take into account that ONLY the skeleton was borrowed,
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note, that the statement of [3] is
	incomplete, delay may actually be estimated even if class
	per-round allotment is less than MTU. Namely, if per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.


	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is advised to find these differences
	and explain to me, why I am wrong 8).

	--- Linux has no EOI event, so that we cannot estimate true class
	idle time. Workaround is to consider the next dequeue event
	as sign that previous packet is finished. This is wrong because of
	internal device queueing, but on a permanently loaded link it is true.
	Moreover, combined with clock integrator, this scheme looks
	very close to an ideal solution.  */

struct cbq_sched_data;


E
Eric Dumazet 已提交
75
struct cbq_class {
76
	struct Qdisc_class_common common;
L
Linus Torvalds 已提交
77 78 79 80 81 82
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */
83
#ifdef CONFIG_NET_CLS_ACT
L
Linus Torvalds 已提交
84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
	unsigned char		police;
#endif

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */


/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
125
	psched_time_t		penalized;
126
	struct gnet_stats_basic_packed bstats;
L
Linus Torvalds 已提交
127
	struct gnet_stats_queue qstats;
128
	struct gnet_stats_rate_est64 rate_est;
L
Linus Torvalds 已提交
129 130
	struct tc_cbq_xstats	xstats;

J
John Fastabend 已提交
131
	struct tcf_proto __rcu	*filter_list;
L
Linus Torvalds 已提交
132 133 134 135

	int			refcnt;
	int			filters;

E
Eric Dumazet 已提交
136
	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
L
Linus Torvalds 已提交
137 138
};

E
Eric Dumazet 已提交
139
struct cbq_sched_data {
140
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
E
Eric Dumazet 已提交
141 142
	int			nclasses[TC_CBQ_MAXPRIO + 1];
	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
L
Linus Torvalds 已提交
143 144 145

	struct cbq_class	link;

E
Eric Dumazet 已提交
146 147
	unsigned int		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
L
Linus Torvalds 已提交
148 149
								   with backlog */

150
#ifdef CONFIG_NET_CLS_ACT
L
Linus Torvalds 已提交
151 152 153 154 155 156
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;		/* Cached timestamp */
E
Eric Dumazet 已提交
157
	unsigned int		pmask;
L
Linus Torvalds 已提交
158

159
	struct hrtimer		delay_timer;
160
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
L
Linus Torvalds 已提交
161 162 163
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
164
	psched_tdiff_t		wd_expires;
L
Linus Torvalds 已提交
165 166 167 168 169
	int			toplevel;
	u32			hgenerator;
};


E
Eric Dumazet 已提交
170
#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
L
Linus Torvalds 已提交
171

E
Eric Dumazet 已提交
172
static inline struct cbq_class *
L
Linus Torvalds 已提交
173 174
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
175
	struct Qdisc_class_common *clc;
L
Linus Torvalds 已提交
176

177 178 179 180
	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
L
Linus Torvalds 已提交
181 182
}

#ifdef CONFIG_NET_CLS_ACT

/* Walk up the tree from @this and return the first ancestor's
 * best-effort default class that differs from @this, or NULL.
 */
static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl;

	for (cl = this->tparent; cl; cl = cl->tparent) {
		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

		if (new != NULL && new != this)
			return new;
	}

	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
E
Eric Dumazet 已提交
202 203 204 205 206 207 208
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
L
Linus Torvalds 已提交
209 210 211 212 213 214 215 216 217 218
 */

static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
J
John Fastabend 已提交
219
	struct tcf_proto *fl;
L
Linus Torvalds 已提交
220 221 222 223 224
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
E
Eric Dumazet 已提交
225
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
L
Linus Torvalds 已提交
226 227 228
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

229
	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
L
Linus Torvalds 已提交
230 231 232 233
	for (;;) {
		int result = 0;
		defmap = head->defaults;

J
John Fastabend 已提交
234
		fl = rcu_dereference_bh(head->filter_list);
L
Linus Torvalds 已提交
235 236 237
		/*
		 * Step 2+n. Apply classifier.
		 */
238
		result = tc_classify(skb, fl, &res, true);
J
John Fastabend 已提交
239
		if (!fl || result < 0)
L
Linus Torvalds 已提交
240 241
			goto fallback;

E
Eric Dumazet 已提交
242 243
		cl = (void *)res.class;
		if (!cl) {
L
Linus Torvalds 已提交
244 245
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
E
Eric Dumazet 已提交
246
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
L
Linus Torvalds 已提交
247 248
				cl = defmap[TC_PRIO_BESTEFFORT];

249
			if (cl == NULL)
L
Linus Torvalds 已提交
250 251
				goto fallback;
		}
252 253
		if (cl->level >= head->level)
			goto fallback;
L
Linus Torvalds 已提交
254 255 256
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
257
		case TC_ACT_STOLEN:
258
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
L
Linus Torvalds 已提交
259 260
		case TC_ACT_SHOT:
			return NULL;
261 262
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
L
Linus Torvalds 已提交
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 *	   apply agency specific classifier.
		 *	   Repeat this procdure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
E
Eric Dumazet 已提交
283
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
L
Linus Torvalds 已提交
284 285 286 287 288 289 290
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}

/*
E
Eric Dumazet 已提交
291 292 293
 * A packet has just been enqueued on the empty class.
 * cbq_activate_class adds it to the tail of active class list
 * of its priority band.
L
Linus Torvalds 已提交
294 295
 */

E
Eric Dumazet 已提交
296
static inline void cbq_activate_class(struct cbq_class *cl)
L
Linus Torvalds 已提交
297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}

/*
 * Unlink class from active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				/* Removed the ring tail; repair it, and if the
				 * ring is now empty, kill the band.
				 */
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
					return;
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}

static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

351
	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
V
Vasily Averin 已提交
352
		psched_time_t now = psched_get_time();
L
Linus Torvalds 已提交
353 354

		do {
P
Patrick McHardy 已提交
355
			if (cl->undertime < now) {
L
Linus Torvalds 已提交
356 357 358
				q->toplevel = cl->level;
				return;
			}
E
Eric Dumazet 已提交
359
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
L
Linus Torvalds 已提交
360 361 362 363 364 365 366
	}
}

static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
367
	int uninitialized_var(ret);
L
Linus Torvalds 已提交
368 369
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

370
#ifdef CONFIG_NET_CLS_ACT
L
Linus Torvalds 已提交
371 372 373
	q->rx_class = cl;
#endif
	if (cl == NULL) {
374
		if (ret & __NET_XMIT_BYPASS)
375
			qdisc_qstats_drop(sch);
L
Linus Torvalds 已提交
376 377 378 379
		kfree_skb(skb);
		return ret;
	}

380
#ifdef CONFIG_NET_CLS_ACT
L
Linus Torvalds 已提交
381 382
	cl->q->__parent = sch;
#endif
383 384
	ret = qdisc_enqueue(skb, cl->q);
	if (ret == NET_XMIT_SUCCESS) {
L
Linus Torvalds 已提交
385 386 387 388 389 390 391
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

392
	if (net_xmit_drop_count(ret)) {
393
		qdisc_qstats_drop(sch);
394 395 396
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
L
Linus Torvalds 已提交
397 398 399
	return ret;
}

400 401
/* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
L
Linus Torvalds 已提交
402 403
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
P
Patrick McHardy 已提交
404
	psched_tdiff_t delay = cl->undertime - q->now;
L
Linus Torvalds 已提交
405 406 407 408

	if (!cl->delayed) {
		delay += cl->offtime;

409
		/*
E
Eric Dumazet 已提交
410 411 412 413 414
		 * Class goes to sleep, so that it will have no
		 * chance to work avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a crap in this
		 * place, apparently they forgot to shift it by cl->ewma_log.
L
Linus Torvalds 已提交
415 416 417 418 419 420 421
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
422
		cl->undertime = q->now + delay;
L
Linus Torvalds 已提交
423 424 425 426 427 428 429 430

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
E
Eric Dumazet 已提交
431 432
	 * real available rate, rather than leaf rate,
	 * which may be tiny (even zero).
L
Linus Torvalds 已提交
433 434 435 436 437 438
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
P
Patrick McHardy 已提交
439
			delay = b->undertime - q->now;
L
Linus Torvalds 已提交
440 441 442 443 444 445 446 447 448 449 450
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}

451 452
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
L
Linus Torvalds 已提交
453 454 455
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
456
	psched_time_t sched = now;
L
Linus Torvalds 已提交
457 458

	if (cl_prev == NULL)
459
		return 0;
L
Linus Torvalds 已提交
460 461 462

	do {
		cl = cl_prev->next_alive;
463
		if (now - cl->penalized > 0) {
L
Linus Torvalds 已提交
464 465 466 467 468 469 470 471 472 473 474 475 476 477 478
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
479
		} else if (sched - cl->penalized > 0)
L
Linus Torvalds 已提交
480 481 482
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

483
	return sched - now;
L
Linus Torvalds 已提交
484 485
}

486
static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
L
Linus Torvalds 已提交
487
{
488
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
489
						delay_timer);
490 491 492
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
E
Eric Dumazet 已提交
493
	unsigned int pmask;
L
Linus Torvalds 已提交
494

495
	now = psched_get_time();
496

L
Linus Torvalds 已提交
497 498 499 500 501
	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
502
		psched_tdiff_t tmp;
L
Linus Torvalds 已提交
503 504 505

		pmask &= ~(1<<prio);

506
		tmp = cbq_undelay_prio(q, prio, now);
L
Linus Torvalds 已提交
507 508 509 510 511 512 513 514
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
515 516 517
		ktime_t time;

		time = ktime_set(0, 0);
518
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
E
Eric Dumazet 已提交
519
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
L
Linus Torvalds 已提交
520 521
	}

522
	qdisc_unthrottled(sch);
523
	__netif_schedule(qdisc_root(sch));
524
	return HRTIMER_NORESTART;
L
Linus Torvalds 已提交
525 526
}

#ifdef CONFIG_NET_CLS_ACT
/* Called by a child qdisc that refused a packet: try to reclassify it
 * into another class and re-enqueue; returns 0 when the skb was consumed,
 * -1 when the caller must drop it.
 */
static int cbq_reshape_fail(struct sk_buff *skb, struct Qdisc *child)
{
	struct Qdisc *sch = child->__parent;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = q->rx_class;

	q->rx_class = NULL;

	if (cl && (cl = cbq_reclassify(skb, cl)) != NULL) {
		int ret;

		cbq_mark_toplevel(q, cl);

		q->rx_class = cl;
		cl->q->__parent = sch;

		ret = qdisc_enqueue(skb, cl->q);
		if (ret == NET_XMIT_SUCCESS) {
			sch->q.qlen++;
			if (!cl->next_alive)
				cbq_activate_class(cl);
			return 0;
		}
		if (net_xmit_drop_count(ret))
			qdisc_qstats_drop(sch);
		return 0;
	}

	qdisc_qstats_drop(sch);
	return -1;
}
#endif

561
/*
E
Eric Dumazet 已提交
562 563 564 565 566 567 568
 * It is mission critical procedure.
 *
 * We "regenerate" toplevel cutoff, if transmitting class
 * has backlog and it is not regulated. It is not part of
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
 */
L
Linus Torvalds 已提交
569

E
Eric Dumazet 已提交
570
static inline void
L
Linus Torvalds 已提交
571 572 573 574 575 576
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
577
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
L
Linus Torvalds 已提交
578 579 580
					q->toplevel = borrowed->level;
					return;
				}
E
Eric Dumazet 已提交
581
			} while ((borrowed = borrowed->borrow) != NULL);
L
Linus Torvalds 已提交
582
		}
583
#if 0
L
Linus Torvalds 已提交
584 585 586 587 588 589 590 591 592 593 594 595 596 597
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}

static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;
598
	psched_time_t now;
L
Linus Torvalds 已提交
599 600

	q->tx_class = NULL;
601 602 603 604
	/* Time integrator. We calculate EOS time
	 * by adding expected packet transmission time.
	 */
	now = q->now + L2T(&q->link, len);
L
Linus Torvalds 已提交
605 606 607 608 609 610 611 612 613

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
E
Eric Dumazet 已提交
614 615 616 617
		 * (now - last) is total time between packet right edges.
		 * (last_pktlen/rate) is "virtual" busy time, so that
		 *
		 *	idle = (now - last) - last_pktlen/rate
L
Linus Torvalds 已提交
618 619
		 */

620
		idle = now - cl->last;
L
Linus Torvalds 已提交
621 622 623 624 625 626
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
E
Eric Dumazet 已提交
627 628 629
		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
		 * cl->avgidle == true_avgidle/W,
		 * hence:
L
Linus Torvalds 已提交
630 631 632 633 634 635 636 637 638 639 640 641 642
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate expected time, when this class
E
Eric Dumazet 已提交
643 644 645 646 647 648
			 * will be allowed to send.
			 * It will occur, when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
L
Linus Torvalds 已提交
649 650 651 652
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
E
Eric Dumazet 已提交
653 654 655 656 657 658
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime virtual clock,
			 * necessary to complete transmitted packet.
			 * (len/phys_bandwidth has been already passed
			 * to the moment of cbq_update)
L
Linus Torvalds 已提交
659 660 661 662 663
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

664
			cl->undertime = now + idle;
L
Linus Torvalds 已提交
665 666 667
		} else {
			/* Underlimit */

668
			cl->undertime = PSCHED_PASTPERFECT;
L
Linus Torvalds 已提交
669 670 671 672 673
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
674 675
		if ((s64)(now - cl->last) > 0)
			cl->last = now;
L
Linus Torvalds 已提交
676 677 678 679 680
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}

E
Eric Dumazet 已提交
681
static inline struct cbq_class *
L
Linus Torvalds 已提交
682 683 684 685 686 687 688 689
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

690
	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
L
Linus Torvalds 已提交
691 692 693 694 695 696
		cl->delayed = 0;
		return cl;
	}

	do {
		/* It is very suspicious place. Now overlimit
E
Eric Dumazet 已提交
697 698 699 700 701 702 703 704
		 * action is generated for not bounded classes
		 * only if link is completely congested.
		 * Though it is in agree with ancestor-only paradigm,
		 * it looks very stupid. Particularly,
		 * it means that this chunk of code will either
		 * never be called or result in strong amplification
		 * of burstiness. Dangerous, silly, and, however,
		 * no another solution exists.
L
Linus Torvalds 已提交
705
		 */
E
Eric Dumazet 已提交
706 707
		cl = cl->borrow;
		if (!cl) {
L
Linus Torvalds 已提交
708
			this_cl->qstats.overlimits++;
709
			cbq_overlimit(this_cl);
L
Linus Torvalds 已提交
710 711 712 713
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
714
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
L
Linus Torvalds 已提交
715 716 717 718 719

	cl->delayed = 0;
	return cl;
}

E
Eric Dumazet 已提交
720
static inline struct sk_buff *
L
Linus Torvalds 已提交
721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
E
Eric Dumazet 已提交
744
				 * this round. Switch to the next one.
L
Linus Torvalds 已提交
745 746 747 748 749 750 751 752 753
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
E
Eric Dumazet 已提交
754 755
			 * It could occur even if cl->q->q.qlen != 0
			 * f.e. if cl->q == "tbf"
L
Linus Torvalds 已提交
756 757 758 759
			 */
			if (skb == NULL)
				goto skip_class;

760
			cl->deficit -= qdisc_pkt_len(skb);
L
Linus Torvalds 已提交
761 762 763 764 765 766 767
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
768 769
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
L
Linus Torvalds 已提交
770 771
#endif
			}
772
			q->tx_len = qdisc_pkt_len(skb);
L
Linus Torvalds 已提交
773 774 775 776 777 778 779 780 781 782 783

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
E
Eric Dumazet 已提交
784
				 * Unlink it from active chain.
L
Linus Torvalds 已提交
785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}

E
Eric Dumazet 已提交
823
static inline struct sk_buff *
L
Linus Torvalds 已提交
824 825 826 827
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
E
Eric Dumazet 已提交
828
	unsigned int activemask;
L
Linus Torvalds 已提交
829

E
Eric Dumazet 已提交
830
	activemask = q->activemask & 0xFF;
L
Linus Torvalds 已提交
831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847
	while (activemask) {
		int prio = ffz(~activemask);
		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}

static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;

848
	now = psched_get_time();
849 850

	if (q->tx_class)
L
Linus Torvalds 已提交
851
		cbq_update(q);
852 853

	q->now = now;
L
Linus Torvalds 已提交
854 855 856 857 858 859

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
860
			qdisc_bstats_update(sch, skb);
L
Linus Torvalds 已提交
861
			sch->q.qlen--;
862
			qdisc_unthrottled(sch);
L
Linus Torvalds 已提交
863 864 865 866
			return skb;
		}

		/* All the classes are overlimit.
E
Eric Dumazet 已提交
867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882
		 *
		 * It is possible, if:
		 *
		 * 1. Scheduler is empty.
		 * 2. Toplevel cutoff inhibited borrowing.
		 * 3. Root class is overlimit.
		 *
		 * Reset 2d and 3d conditions and retry.
		 *
		 * Note, that NS and cbq-2.0 are buggy, peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for toplevel algorithm.
		 *
		 * Our version is better, but slower, because it requires
		 * two passes, but it is unavoidable with top-level sharing.
		 */
L
Linus Torvalds 已提交
883 884

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
885
		    q->link.undertime == PSCHED_PASTPERFECT)
L
Linus Torvalds 已提交
886 887 888
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
889
		q->link.undertime = PSCHED_PASTPERFECT;
L
Linus Torvalds 已提交
890 891 892
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
E
Eric Dumazet 已提交
893 894
	 * Sigh... start watchdog timer in the last case.
	 */
L
Linus Torvalds 已提交
895 896

	if (sch->q.qlen) {
897
		qdisc_qstats_overlimit(sch);
898 899
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
900
						now + q->wd_expires);
L
Linus Torvalds 已提交
901 902 903 904 905 906 907 908 909 910 911 912 913 914 915
	}
	return NULL;
}

/* CBQ class maintenance routines */

/* Recompute @this->level (max child level + 1) and propagate the
 * change up through all tree ancestors.
 */
static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		cl = this->children;
		if (cl) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level + 1;
	} while ((this = this->tparent) != NULL);
}

static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
930
	unsigned int h;
L
Linus Torvalds 已提交
931 932 933 934

	if (q->quanta[prio] == 0)
		return;

935
	for (h = 0; h < q->clhash.hashsize; h++) {
936
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
L
Linus Torvalds 已提交
937
			/* BUGGGG... Beware! This expression suffer of
E
Eric Dumazet 已提交
938
			 * arithmetic overflows!
L
Linus Torvalds 已提交
939 940 941 942 943
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
944 945
			if (cl->quantum <= 0 ||
			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
946 947
				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					cl->common.classid, cl->quantum);
948
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
L
Linus Torvalds 已提交
949 950 951 952 953 954 955 956 957
			}
		}
	}
}

static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
E
Eric Dumazet 已提交
958
	unsigned int h;
L
Linus Torvalds 已提交
959 960 961 962 963
	int i;

	if (split == NULL)
		return;

E
Eric Dumazet 已提交
964 965
	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
L
Linus Torvalds 已提交
966 967 968
			split->defaults[i] = NULL;
	}

E
Eric Dumazet 已提交
969
	for (i = 0; i <= TC_PRIO_MAX; i++) {
L
Linus Torvalds 已提交
970 971 972 973 974
		int level = split->level;

		if (split->defaults[i])
			continue;

975
		for (h = 0; h < q->clhash.hashsize; h++) {
L
Linus Torvalds 已提交
976 977
			struct cbq_class *c;

978
			hlist_for_each_entry(c, &q->clhash.hash[h],
979
					     common.hnode) {
L
Linus Torvalds 已提交
980
				if (c->split == split && c->level < level &&
E
Eric Dumazet 已提交
981
				    c->defmap & (1<<i)) {
L
Linus Torvalds 已提交
982 983 984 985 986 987 988 989 990 991 992 993 994
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}

/* Attach @cl to split node @splitid (0 = keep current split) and merge
 * @def under @mask into its default-priority map, then resync the split.
 */
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		/* Moving to a new split node: clear the old mapping first. */
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}

static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

1026
	qdisc_class_hash_remove(&q->clhash, &this->common);
L
Linus Torvalds 已提交
1027 1028

	if (this->tparent) {
E
Eric Dumazet 已提交
1029
		clp = &this->sibling;
L
Linus Torvalds 已提交
1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
1045
		WARN_ON(this->sibling != this);
L
Linus Torvalds 已提交
1046 1047 1048 1049 1050 1051 1052 1053 1054
	}
}

static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
1055
	qdisc_class_hash_insert(&q->clhash, &this->common);
L
Linus Torvalds 已提交
1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}

E
Eric Dumazet 已提交
1068
static unsigned int cbq_drop(struct Qdisc *sch)
L
Linus Torvalds 已提交
1069 1070 1071 1072 1073 1074 1075
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl, *cl_head;
	int prio;
	unsigned int len;

	for (prio = TC_CBQ_MAXPRIO; prio >= 0; prio--) {
E
Eric Dumazet 已提交
1076 1077
		cl_head = q->active[prio];
		if (!cl_head)
L
Linus Torvalds 已提交
1078 1079 1080 1081 1082 1083
			continue;

		cl = cl_head;
		do {
			if (cl->q->ops->drop && (len = cl->q->ops->drop(cl->q))) {
				sch->q.qlen--;
1084 1085
				if (!cl->q->q.qlen)
					cbq_deactivate_class(cl);
L
Linus Torvalds 已提交
1086 1087 1088 1089 1090 1091 1092 1093
				return len;
			}
		} while ((cl = cl->next_alive) != cl_head);
	}
	return 0;
}

static void
E
Eric Dumazet 已提交
1094
cbq_reset(struct Qdisc *sch)
L
Linus Torvalds 已提交
1095 1096 1097 1098
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
E
Eric Dumazet 已提交
1099
	unsigned int h;
L
Linus Torvalds 已提交
1100 1101 1102 1103 1104

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
1105
	qdisc_watchdog_cancel(&q->watchdog);
1106
	hrtimer_cancel(&q->delay_timer);
L
Linus Torvalds 已提交
1107
	q->toplevel = TC_CBQ_MAXLEVEL;
1108
	q->now = psched_get_time();
L
Linus Torvalds 已提交
1109 1110 1111 1112

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

1113
	for (h = 0; h < q->clhash.hashsize; h++) {
1114
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
L
Linus Torvalds 已提交
1115 1116 1117
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
1118
			cl->undertime = PSCHED_PASTPERFECT;
L
Linus Torvalds 已提交
1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
	sch->q.qlen = 0;
}


static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
E
Eric Dumazet 已提交
1130 1131 1132
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
L
Linus Torvalds 已提交
1133
	}
E
Eric Dumazet 已提交
1134
	if (lss->change & TCF_CBQ_LSS_EWMA)
L
Linus Torvalds 已提交
1135
		cl->ewma_log = lss->ewma_log;
E
Eric Dumazet 已提交
1136
	if (lss->change & TCF_CBQ_LSS_AVPKT)
L
Linus Torvalds 已提交
1137
		cl->avpkt = lss->avpkt;
E
Eric Dumazet 已提交
1138
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
L
Linus Torvalds 已提交
1139
		cl->minidle = -(long)lss->minidle;
E
Eric Dumazet 已提交
1140
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
L
Linus Torvalds 已提交
1141 1142 1143
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
E
Eric Dumazet 已提交
1144
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
L
Linus Torvalds 已提交
1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171
		cl->offtime = lss->offtime;
	return 0;
}

/* Detach @cl from the per-priority WRR accounting and renormalize. */
static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->quanta[cl->priority] -= cl->weight;
	q->nclasses[cl->priority]--;
	cbq_normalize_quanta(q, cl->priority);
}

/* Attach @cl to the per-priority WRR accounting and renormalize. */
static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->quanta[cl->priority] += cl->weight;
	q->nclasses[cl->priority]++;
	cbq_normalize_quanta(q, cl->priority);
}

static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
E
Eric Dumazet 已提交
1172
		cl->priority = wrr->priority - 1;
L
Linus Torvalds 已提交
1173 1174
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
E
Eric Dumazet 已提交
1175
			cl->priority2 = TC_CBQ_MAXPRIO - 1;
L
Linus Torvalds 已提交
1176 1177 1178 1179 1180 1181
	}

	cbq_addprio(q, cl);
	return 0;
}

#ifdef CONFIG_NET_CLS_ACT
/*
 * Apply a TCA_CBQ_POLICE request: record the policing mode and install
 * (or clear) the reshape_fail hook on the child qdisc accordingly.
 * Always returns 0.  (Removed blame-viewer residue.)
 */
static int cbq_set_police(struct cbq_class *cl, struct tc_cbq_police *p)
{
	cl->police = p->police;

	if (cl->q->handle) {
		if (p->police == TC_POLICE_RECLASSIFY)
			cl->q->reshape_fail = cbq_reshape_fail;
		else
			cl->q->reshape_fail = NULL;
	}
	return 0;
}
#endif

/*
 * Apply a TCA_CBQ_FOPT request: move the class to the requested split
 * node and merge the new defmap bits.  Always returns 0.
 */
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}

1203 1204 1205 1206 1207 1208 1209 1210 1211 1212
static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};

1213
static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
L
Linus Torvalds 已提交
1214 1215
{
	struct cbq_sched_data *q = qdisc_priv(sch);
1216
	struct nlattr *tb[TCA_CBQ_MAX + 1];
L
Linus Torvalds 已提交
1217
	struct tc_ratespec *r;
1218 1219
	int err;

1220
	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1221 1222
	if (err < 0)
		return err;
L
Linus Torvalds 已提交
1223

1224
	if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
L
Linus Torvalds 已提交
1225 1226
		return -EINVAL;

1227
	r = nla_data(tb[TCA_CBQ_RATE]);
L
Linus Torvalds 已提交
1228

1229
	if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
L
Linus Torvalds 已提交
1230 1231
		return -EINVAL;

1232 1233 1234 1235
	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_rtab;

L
Linus Torvalds 已提交
1236 1237
	q->link.refcnt = 1;
	q->link.sibling = &q->link;
1238
	q->link.common.classid = sch->handle;
L
Linus Torvalds 已提交
1239
	q->link.qdisc = sch;
1240 1241 1242
	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      sch->handle);
	if (!q->link.q)
L
Linus Torvalds 已提交
1243 1244
		q->link.q = &noop_qdisc;

E
Eric Dumazet 已提交
1245 1246 1247
	q->link.priority = TC_CBQ_MAXPRIO - 1;
	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1248
	q->link.allot = psched_mtu(qdisc_dev(sch));
L
Linus Torvalds 已提交
1249 1250 1251 1252 1253 1254 1255
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

1256
	qdisc_watchdog_init(&q->watchdog, sch);
E
Eric Dumazet 已提交
1257
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
L
Linus Torvalds 已提交
1258 1259
	q->delay_timer.function = cbq_undelay;
	q->toplevel = TC_CBQ_MAXLEVEL;
1260
	q->now = psched_get_time();
L
Linus Torvalds 已提交
1261 1262 1263

	cbq_link_class(&q->link);

1264 1265
	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
L
Linus Torvalds 已提交
1266 1267 1268

	cbq_addprio(q, &q->link);
	return 0;
1269 1270 1271 1272

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
L
Linus Torvalds 已提交
1273 1274
}

E
Eric Dumazet 已提交
1275
static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
L
Linus Torvalds 已提交
1276
{
1277
	unsigned char *b = skb_tail_pointer(skb);
L
Linus Torvalds 已提交
1278

1279 1280
	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
		goto nla_put_failure;
L
Linus Torvalds 已提交
1281 1282
	return skb->len;

1283
nla_put_failure:
1284
	nlmsg_trim(skb, b);
L
Linus Torvalds 已提交
1285 1286 1287
	return -1;
}

E
Eric Dumazet 已提交
1288
static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
L
Linus Torvalds 已提交
1289
{
1290
	unsigned char *b = skb_tail_pointer(skb);
L
Linus Torvalds 已提交
1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
1305 1306
	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
		goto nla_put_failure;
L
Linus Torvalds 已提交
1307 1308
	return skb->len;

1309
nla_put_failure:
1310
	nlmsg_trim(skb, b);
L
Linus Torvalds 已提交
1311 1312 1313
	return -1;
}

E
Eric Dumazet 已提交
1314
static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
L
Linus Torvalds 已提交
1315
{
1316
	unsigned char *b = skb_tail_pointer(skb);
L
Linus Torvalds 已提交
1317 1318
	struct tc_cbq_wrropt opt;

1319
	memset(&opt, 0, sizeof(opt));
L
Linus Torvalds 已提交
1320 1321
	opt.flags = 0;
	opt.allot = cl->allot;
E
Eric Dumazet 已提交
1322 1323
	opt.priority = cl->priority + 1;
	opt.cpriority = cl->cpriority + 1;
L
Linus Torvalds 已提交
1324
	opt.weight = cl->weight;
1325 1326
	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
		goto nla_put_failure;
L
Linus Torvalds 已提交
1327 1328
	return skb->len;

1329
nla_put_failure:
1330
	nlmsg_trim(skb, b);
L
Linus Torvalds 已提交
1331 1332 1333
	return -1;
}

E
Eric Dumazet 已提交
1334
static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
L
Linus Torvalds 已提交
1335
{
1336
	unsigned char *b = skb_tail_pointer(skb);
L
Linus Torvalds 已提交
1337 1338 1339
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
1340
		opt.split = cl->split ? cl->split->common.classid : 0;
L
Linus Torvalds 已提交
1341 1342
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
1343 1344
		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
			goto nla_put_failure;
L
Linus Torvalds 已提交
1345 1346 1347
	}
	return skb->len;

1348
nla_put_failure:
1349
	nlmsg_trim(skb, b);
L
Linus Torvalds 已提交
1350 1351 1352
	return -1;
}

#ifdef CONFIG_NET_CLS_ACT
/*
 * Emit the class's TCA_CBQ_POLICE attribute into @skb if policing is
 * configured.  Returns the new skb length, or -1 after trimming on
 * failure.  (Removed blame-viewer residue.)
 */
static int cbq_dump_police(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_police opt;

	if (cl->police) {
		opt.police = cl->police;
		opt.__res1 = 0;
		opt.__res2 = 0;
		if (nla_put(skb, TCA_CBQ_POLICE, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
#endif

/*
 * Dump all per-class attributes into @skb.  Returns 0 on success, -1 if
 * any sub-dump failed.  (Removed blame-viewer residue.)
 */
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
#ifdef CONFIG_NET_CLS_ACT
	    cbq_dump_police(skb, cl) < 0 ||
#endif
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}

static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
1390
	struct nlattr *nest;
L
Linus Torvalds 已提交
1391

1392 1393 1394
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
L
Linus Torvalds 已提交
1395
	if (cbq_dump_attr(skb, &q->link) < 0)
1396
		goto nla_put_failure;
1397
	return nla_nest_end(skb, nest);
L
Linus Torvalds 已提交
1398

1399
nla_put_failure:
1400
	nla_nest_cancel(skb, nest);
L
Linus Torvalds 已提交
1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416
	return -1;
}

static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}

static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
E
Eric Dumazet 已提交
1417
	struct cbq_class *cl = (struct cbq_class *)arg;
1418
	struct nlattr *nest;
L
Linus Torvalds 已提交
1419 1420

	if (cl->tparent)
1421
		tcm->tcm_parent = cl->tparent->common.classid;
L
Linus Torvalds 已提交
1422 1423
	else
		tcm->tcm_parent = TC_H_ROOT;
1424
	tcm->tcm_handle = cl->common.classid;
L
Linus Torvalds 已提交
1425 1426
	tcm->tcm_info = cl->q->handle;

1427 1428 1429
	nest = nla_nest_start(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
L
Linus Torvalds 已提交
1430
	if (cbq_dump_attr(skb, cl) < 0)
1431
		goto nla_put_failure;
1432
	return nla_nest_end(skb, nest);
L
Linus Torvalds 已提交
1433

1434
nla_put_failure:
1435
	nla_nest_cancel(skb, nest);
L
Linus Torvalds 已提交
1436 1437 1438 1439 1440 1441 1442 1443
	return -1;
}

static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
	struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
E
Eric Dumazet 已提交
1444
	struct cbq_class *cl = (struct cbq_class *)arg;
L
Linus Torvalds 已提交
1445 1446 1447 1448

	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;

1449
	if (cl->undertime != PSCHED_PASTPERFECT)
P
Patrick McHardy 已提交
1450
		cl->xstats.undertime = cl->undertime - q->now;
L
Linus Torvalds 已提交
1451

1452 1453
	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
				  d, NULL, &cl->bstats) < 0 ||
1454
	    gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
1455
	    gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
L
Linus Torvalds 已提交
1456 1457 1458 1459 1460 1461 1462 1463
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}

static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old)
{
E
Eric Dumazet 已提交
1464
	struct cbq_class *cl = (struct cbq_class *)arg;
L
Linus Torvalds 已提交
1465

1466
	if (new == NULL) {
1467
		new = qdisc_create_dflt(sch->dev_queue,
1468 1469 1470 1471
					&pfifo_qdisc_ops, cl->common.classid);
		if (new == NULL)
			return -ENOBUFS;
	} else {
1472
#ifdef CONFIG_NET_CLS_ACT
1473 1474
		if (cl->police == TC_POLICE_RECLASSIFY)
			new->reshape_fail = cbq_reshape_fail;
L
Linus Torvalds 已提交
1475 1476
#endif
	}
1477

1478
	*old = qdisc_replace(sch, new, &cl->q);
1479
	return 0;
L
Linus Torvalds 已提交
1480 1481
}

E
Eric Dumazet 已提交
1482
static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
L
Linus Torvalds 已提交
1483
{
E
Eric Dumazet 已提交
1484
	struct cbq_class *cl = (struct cbq_class *)arg;
L
Linus Torvalds 已提交
1485

1486
	return cl->q;
L
Linus Torvalds 已提交
1487 1488
}

1489 1490 1491 1492 1493 1494 1495 1496
static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->q->q.qlen == 0)
		cbq_deactivate_class(cl);
}

L
Linus Torvalds 已提交
1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512
static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		cl->refcnt++;
		return (unsigned long)cl;
	}
	return 0;
}

/*
 * Free a class: destroy its filter chain, child qdisc, rate table and
 * rate estimator.  The embedded root class (&q->link) is not kfree'd.
 * (Removed blame-viewer residue.)
 */
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_destroy_chain(&cl->filter_list);
	qdisc_destroy(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->bstats, &cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}

E
Eric Dumazet 已提交
1523
static void cbq_destroy(struct Qdisc *sch)
L
Linus Torvalds 已提交
1524 1525
{
	struct cbq_sched_data *q = qdisc_priv(sch);
1526
	struct hlist_node *next;
L
Linus Torvalds 已提交
1527
	struct cbq_class *cl;
E
Eric Dumazet 已提交
1528
	unsigned int h;
L
Linus Torvalds 已提交
1529

1530
#ifdef CONFIG_NET_CLS_ACT
L
Linus Torvalds 已提交
1531 1532 1533 1534 1535 1536 1537
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leafs which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
1538
	for (h = 0; h < q->clhash.hashsize; h++) {
1539
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
1540
			tcf_destroy_chain(&cl->filter_list);
1541
	}
1542
	for (h = 0; h < q->clhash.hashsize; h++) {
1543
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1544
					  common.hnode)
L
Linus Torvalds 已提交
1545 1546
			cbq_destroy_class(sch, cl);
	}
1547
	qdisc_class_hash_destroy(&q->clhash);
L
Linus Torvalds 已提交
1548 1549 1550 1551
}

static void cbq_put(struct Qdisc *sch, unsigned long arg)
{
E
Eric Dumazet 已提交
1552
	struct cbq_class *cl = (struct cbq_class *)arg;
L
Linus Torvalds 已提交
1553 1554

	if (--cl->refcnt == 0) {
1555
#ifdef CONFIG_NET_CLS_ACT
1556
		spinlock_t *root_lock = qdisc_root_sleeping_lock(sch);
L
Linus Torvalds 已提交
1557 1558
		struct cbq_sched_data *q = qdisc_priv(sch);

1559
		spin_lock_bh(root_lock);
L
Linus Torvalds 已提交
1560 1561
		if (q->rx_class == cl)
			q->rx_class = NULL;
1562
		spin_unlock_bh(root_lock);
L
Linus Torvalds 已提交
1563 1564 1565 1566 1567 1568 1569
#endif

		cbq_destroy_class(sch, cl);
	}
}

static int
1570
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
L
Linus Torvalds 已提交
1571 1572 1573 1574
		 unsigned long *arg)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
E
Eric Dumazet 已提交
1575
	struct cbq_class *cl = (struct cbq_class *)*arg;
1576 1577
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
L
Linus Torvalds 已提交
1578 1579 1580
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

1581
	if (opt == NULL)
L
Linus Torvalds 已提交
1582 1583
		return -EINVAL;

1584
	err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy);
1585 1586 1587
	if (err < 0)
		return err;

1588 1589 1590
	if (tb[TCA_CBQ_OVL_STRATEGY])
		return -EOPNOTSUPP;

L
Linus Torvalds 已提交
1591 1592 1593
	if (cl) {
		/* Check parent */
		if (parentid) {
1594 1595
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid)
L
Linus Torvalds 已提交
1596 1597 1598 1599 1600
				return -EINVAL;
			if (!cl->tparent && parentid != TC_H_ROOT)
				return -EINVAL;
		}

1601
		if (tb[TCA_CBQ_RATE]) {
1602 1603
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB]);
L
Linus Torvalds 已提交
1604 1605 1606 1607
			if (rtab == NULL)
				return -EINVAL;
		}

1608
		if (tca[TCA_RATE]) {
1609 1610
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
1611 1612
						    NULL,
						    qdisc_root_sleeping_running(sch),
1613 1614
						    tca[TCA_RATE]);
			if (err) {
1615
				qdisc_put_rtab(rtab);
1616 1617 1618 1619
				return err;
			}
		}

L
Linus Torvalds 已提交
1620 1621 1622 1623 1624 1625 1626
		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
1627 1628
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
L
Linus Torvalds 已提交
1629 1630
		}

1631 1632
		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
L
Linus Torvalds 已提交
1633

1634
		if (tb[TCA_CBQ_WRROPT]) {
L
Linus Torvalds 已提交
1635
			cbq_rmprio(q, cl);
1636
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
L
Linus Torvalds 已提交
1637 1638
		}

1639
#ifdef CONFIG_NET_CLS_ACT
1640 1641
		if (tb[TCA_CBQ_POLICE])
			cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
L
Linus Torvalds 已提交
1642 1643
#endif

1644 1645
		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
L
Linus Torvalds 已提交
1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

1658 1659
	if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
	    tb[TCA_CBQ_LSSOPT] == NULL)
L
Linus Torvalds 已提交
1660 1661
		return -EINVAL;

1662
	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
L
Linus Torvalds 已提交
1663 1664 1665 1666 1667
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
E
Eric Dumazet 已提交
1668 1669
		if (TC_H_MAJ(classid ^ sch->handle) ||
		    cbq_class_lookup(q, classid))
L
Linus Torvalds 已提交
1670 1671 1672
			goto failure;
	} else {
		int i;
E
Eric Dumazet 已提交
1673
		classid = TC_H_MAKE(sch->handle, 0x8000);
L
Linus Torvalds 已提交
1674

E
Eric Dumazet 已提交
1675
		for (i = 0; i < 0x8000; i++) {
L
Linus Torvalds 已提交
1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691 1692 1693 1694 1695
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000)
			goto failure;
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (parent == NULL)
			goto failure;
	}

	err = -ENOBUFS;
1696
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
L
Linus Torvalds 已提交
1697 1698
	if (cl == NULL)
		goto failure;
1699 1700

	if (tca[TCA_RATE]) {
1701
		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1702 1703
					NULL,
					qdisc_root_sleeping_running(sch),
1704 1705 1706 1707 1708 1709 1710
					tca[TCA_RATE]);
		if (err) {
			kfree(cl);
			goto failure;
		}
	}

L
Linus Torvalds 已提交
1711 1712 1713
	cl->R_tab = rtab;
	rtab = NULL;
	cl->refcnt = 1;
1714 1715
	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
	if (!cl->q)
L
Linus Torvalds 已提交
1716
		cl->q = &noop_qdisc;
1717
	cl->common.classid = classid;
L
Linus Torvalds 已提交
1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
1731 1732
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
E
Eric Dumazet 已提交
1733
	if (cl->ewma_log == 0)
L
Linus Torvalds 已提交
1734
		cl->ewma_log = q->link.ewma_log;
E
Eric Dumazet 已提交
1735
	if (cl->maxidle == 0)
L
Linus Torvalds 已提交
1736
		cl->maxidle = q->link.maxidle;
E
Eric Dumazet 已提交
1737
	if (cl->avpkt == 0)
L
Linus Torvalds 已提交
1738
		cl->avpkt = q->link.avpkt;
1739
#ifdef CONFIG_NET_CLS_ACT
1740 1741
	if (tb[TCA_CBQ_POLICE])
		cbq_set_police(cl, nla_data(tb[TCA_CBQ_POLICE]));
L
Linus Torvalds 已提交
1742
#endif
1743 1744
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
L
Linus Torvalds 已提交
1745 1746
	sch_tree_unlock(sch);

1747 1748
	qdisc_class_hash_grow(sch, &q->clhash);

L
Linus Torvalds 已提交
1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759
	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}

static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
E
Eric Dumazet 已提交
1760
	struct cbq_class *cl = (struct cbq_class *)arg;
1761
	unsigned int qlen, backlog;
L
Linus Torvalds 已提交
1762 1763 1764 1765 1766 1767

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

1768
	qlen = cl->q->q.qlen;
1769
	backlog = cl->q->qstats.backlog;
1770
	qdisc_reset(cl->q);
1771
	qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1772

L
Linus Torvalds 已提交
1773 1774 1775 1776 1777 1778 1779 1780 1781
	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
1782
#ifdef CONFIG_NET_CLS_ACT
L
Linus Torvalds 已提交
1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

1795 1796 1797 1798 1799
	BUG_ON(--cl->refcnt == 0);
	/*
	 * This shouldn't happen: we "hold" one cops->get() when called
	 * from tc_ctl_tclass; the destroy method is done from cops->put().
	 */
L
Linus Torvalds 已提交
1800 1801 1802 1803

	return 0;
}

J
John Fastabend 已提交
1804 1805
static struct tcf_proto __rcu **cbq_find_tcf(struct Qdisc *sch,
					     unsigned long arg)
L
Linus Torvalds 已提交
1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return &cl->filter_list;
}

/*
 * Class ->bind_tcf callback: bind a filter to @classid, refusing binds
 * from a parent at or below the target's level.  Returns the class
 * pointer as unsigned long, or 0 on failure.
 * (Removed blame-viewer residue.)
 */
static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class *)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}

static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
E
Eric Dumazet 已提交
1834
	struct cbq_class *cl = (struct cbq_class *)arg;
L
Linus Torvalds 已提交
1835 1836 1837 1838 1839 1840 1841

	cl->filters--;
}

static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
1842
	struct cbq_class *cl;
E
Eric Dumazet 已提交
1843
	unsigned int h;
L
Linus Torvalds 已提交
1844 1845 1846 1847

	if (arg->stop)
		return;

1848
	for (h = 0; h < q->clhash.hashsize; h++) {
1849
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
L
Linus Torvalds 已提交
1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}

1863
static const struct Qdisc_class_ops cbq_class_ops = {
L
Linus Torvalds 已提交
1864 1865
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
1866
	.qlen_notify	=	cbq_qlen_notify,
L
Linus Torvalds 已提交
1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878
	.get		=	cbq_get,
	.put		=	cbq_put,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_chain	=	cbq_find_tcf,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};

1879
static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
L
Linus Torvalds 已提交
1880 1881 1882 1883 1884 1885
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
1886
	.peek		=	qdisc_peek_dequeued,
L
Linus Torvalds 已提交
1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900
	.drop		=	cbq_drop,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};

/* Register the CBQ qdisc with the packet scheduler core at module load. */
static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
1901
static void __exit cbq_module_exit(void)
L
Linus Torvalds 已提交
1902 1903 1904 1905 1906 1907
{
	unregister_qdisc(&cbq_qdisc_ops);
}
/* Module entry/exit hooks and license declaration. */
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");