sch_generic.c 25.5 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
27
#include <linux/slab.h>
28
#include <linux/if_vlan.h>
29
#include <net/sch_generic.h>
L
Linus Torvalds 已提交
30
#include <net/pkt_sched.h>
E
Eric Dumazet 已提交
31
#include <net/dst.h>
32
#include <trace/events/qdisc.h>
L
Linus Torvalds 已提交
33

34 35 36 37
/* Qdisc to use by default when none is configured explicitly.
 * Points at pfifo_fast; may be overridden (e.g. via the
 * net.core.default_qdisc sysctl handled elsewhere).
 */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

L
Linus Torvalds 已提交
38 39
/* Main transmission queue. */

40
/* Modifications to data participating in scheduling must be protected with
41
 * qdisc_lock(qdisc) spinlock.
42 43
 *
 * The idea is the following:
44 45
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
46
 * - updates to tree and tree walking are only done under the rtnl mutex.
L
Linus Torvalds 已提交
47 48
 */

49
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
50
{
51
	q->gso_skb = skb;
52
	q->qstats.requeues++;
53
	qdisc_qstats_backlog_inc(q, skb);
54
	q->q.qlen++;	/* it's still part of the queue */
55
	__netif_schedule(q);
56

57 58 59
	return 0;
}

60 61
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
62 63
				 const struct netdev_queue *txq,
				 int *packets)
64
{
65
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
66 67

	while (bytelimit > 0) {
68
		struct sk_buff *nskb = q->dequeue(q);
69

70
		if (!nskb)
71 72
			break;

73 74 75
		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
76
		(*packets)++; /* GSO counts as one pkt */
77
	}
78
	skb->next = NULL;
79 80
}

81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq.
 * An skb destined for a different txq is parked in q->skb_bad_txq
 * (re-accounted as queued) and ends the chain.  At most 8 skbs are
 * chained; *packets is bumped by the number actually chained.
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *next_skb;
	int chained = 0;

	while (chained < 8) {
		next_skb = q->dequeue(q);
		if (!next_skb)
			break;
		if (unlikely(skb_get_queue_mapping(next_skb) != mapping)) {
			/* Wrong txq: defer it, keep it accounted as queued. */
			q->skb_bad_txq = next_skb;
			qdisc_qstats_backlog_inc(q, next_skb);
			q->q.qlen++;
			break;
		}
		skb->next = next_skb;
		skb = next_skb;
		chained++;
	}
	(*packets) += chained;
	skb->next = NULL;
}

109 110 111
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
112 113
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
114
{
115
	struct sk_buff *skb = q->gso_skb;
116
	const struct netdev_queue *txq = q->dev_queue;
117

118
	*packets = 1;
119
	if (unlikely(skb)) {
120 121
		/* skb in gso_skb were already validated */
		*validate = false;
122
		/* check the reason of requeuing without tx lock first */
123
		txq = skb_get_tx_queue(txq->dev, skb);
124
		if (!netif_xmit_frozen_or_stopped(txq)) {
125
			q->gso_skb = NULL;
126
			qdisc_qstats_backlog_dec(q, skb);
127 128
			q->q.qlen--;
		} else
129
			skb = NULL;
130
		goto trace;
131 132 133 134 135 136 137 138 139 140 141
	}
	*validate = true;
	skb = q->skb_bad_txq;
	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->skb_bad_txq = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
			goto bulk;
142
		}
143 144
		skb = NULL;
		goto trace;
145 146 147 148 149 150 151 152 153 154
	}
	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
	    !netif_xmit_frozen_or_stopped(txq))
		skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
155
	}
156 157
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
158 159 160
	return skb;
}

161
/*
162
 * Transmit possibly several skbs, and handle the return status as
163
 * required. Owning running seqcount bit guarantees that
164
 * only one CPU can execute this function.
165 166 167 168 169
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
170 171
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
172
		    spinlock_t *root_lock, bool validate)
L
Linus Torvalds 已提交
173
{
174
	int ret = NETDEV_TX_BUSY;
175 176 177

	/* And release qdisc */
	spin_unlock(root_lock);
178

179 180 181
	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev);
182

183
	if (likely(skb)) {
184 185 186
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
187

188
		HARD_TX_UNLOCK(dev, txq);
189
	} else {
190
		spin_lock(root_lock);
191
		return qdisc_qlen(q);
192
	}
193
	spin_lock(root_lock);
194

195 196
	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
197
		ret = qdisc_qlen(q);
198
	} else {
199
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
200 201 202
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);
203

204
		ret = dev_requeue_skb(skb, q);
205
	}
206

207
	if (ret && netif_xmit_frozen_or_stopped(txq))
208 209
		ret = 0;

210
	return ret;
L
Linus Torvalds 已提交
211 212
}

213 214 215
/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
216
 * running seqcount guarantees only one CPU can process
217 218 219 220 221 222 223 224 225 226 227 228 229 230 231
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
232
static inline int qdisc_restart(struct Qdisc *q, int *packets)
233 234 235 236 237
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;
238
	bool validate;
239 240

	/* Dequeue packet */
241
	skb = dequeue_skb(q, &validate, packets);
242 243
	if (unlikely(!skb))
		return 0;
244

245 246
	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
247
	txq = skb_get_tx_queue(dev, skb);
248

249
	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
250 251
}

252
void __qdisc_run(struct Qdisc *q)
H
Herbert Xu 已提交
253
{
254
	int quota = dev_tx_weight;
255
	int packets;
256

257
	while (qdisc_restart(q, &packets)) {
258
		/*
J
jamal 已提交
259 260 261
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
262
		 */
263 264
		quota -= packets;
		if (quota <= 0 || need_resched()) {
265
			__netif_schedule(q);
266
			break;
267 268
		}
	}
H
Herbert Xu 已提交
269

270
	qdisc_run_end(q);
H
Herbert Xu 已提交
271 272
}

273 274
unsigned long dev_trans_start(struct net_device *dev)
{
275
	unsigned long val, res;
276 277
	unsigned int i;

278 279
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
F
Florian Westphal 已提交
280 281
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
282 283 284 285
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
286

287 288 289 290
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

291
static void dev_watchdog(struct timer_list *t)
L
Linus Torvalds 已提交
292
{
293
	struct net_device *dev = from_timer(dev, t, watchdog_timer);
L
Linus Torvalds 已提交
294

H
Herbert Xu 已提交
295
	netif_tx_lock(dev);
296
	if (!qdisc_tx_is_noop(dev)) {
L
Linus Torvalds 已提交
297 298 299
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
300
			int some_queue_timedout = 0;
301
			unsigned int i;
302
			unsigned long trans_start;
303 304 305 306 307

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
F
Florian Westphal 已提交
308
				trans_start = txq->trans_start;
309
				if (netif_xmit_stopped(txq) &&
310 311 312
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
313
					txq->trans_timeout++;
314 315 316
					break;
				}
			}
317

318 319
			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
320
				       dev->name, netdev_drivername(dev), i);
321
				dev->netdev_ops->ndo_tx_timeout(dev);
L
Linus Torvalds 已提交
322
			}
323 324 325
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
L
Linus Torvalds 已提交
326 327 328
				dev_hold(dev);
		}
	}
H
Herbert Xu 已提交
329
	netif_tx_unlock(dev);
L
Linus Torvalds 已提交
330 331 332 333 334 335

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
336
	if (dev->netdev_ops->ndo_tx_timeout) {
L
Linus Torvalds 已提交
337 338
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
339 340
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
L
Linus Torvalds 已提交
341 342 343 344 345 346 347 348 349 350 351
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
H
Herbert Xu 已提交
352
	netif_tx_lock_bh(dev);
L
Linus Torvalds 已提交
353
	if (del_timer(&dev->watchdog_timer))
354
		dev_put(dev);
H
Herbert Xu 已提交
355
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
356 357
}

358 359 360 361 362 363
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
364 365
void netif_carrier_on(struct net_device *dev)
{
J
Jeff Garzik 已提交
366
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
367 368
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
369
		atomic_inc(&dev->carrier_changes);
370
		linkwatch_fire_event(dev);
J
Jeff Garzik 已提交
371 372 373
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
374
}
375
EXPORT_SYMBOL(netif_carrier_on);
376

377 378 379 380 381 382
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
383 384
void netif_carrier_off(struct net_device *dev)
{
385 386 387
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
388
		atomic_inc(&dev->carrier_changes);
389
		linkwatch_fire_event(dev);
390
	}
391
}
392
EXPORT_SYMBOL(netif_carrier_off);
393

L
Linus Torvalds 已提交
394 395 396 397 398
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

399 400
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
L
Linus Torvalds 已提交
401
{
402
	__qdisc_drop(skb, to_free);
L
Linus Torvalds 已提交
403 404 405
	return NET_XMIT_CN;
}

406
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
407 408 409 410
{
	return NULL;
}

411
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
L
Linus Torvalds 已提交
412 413 414 415
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
416
	.peek		=	noop_dequeue,
L
Linus Torvalds 已提交
417 418 419
	.owner		=	THIS_MODULE,
};

420 421
static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
422
	.qdisc_sleeping	=	&noop_qdisc,
423 424
};

L
Linus Torvalds 已提交
425 426 427 428
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
429
	.ops		=	&noop_qdisc_ops,
430
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
431
	.dev_queue	=	&noop_netdev_queue,
432
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
433
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
L
Linus Torvalds 已提交
434
};
435
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
436

P
Phil Sutter 已提交
437 438 439 440 441 442 443 444 445 446
static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

/* "noqueue": used for virtual devices that transmit directly. */
struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

E
Eric Dumazet 已提交
456 457 458
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};
459 460 461 462 463 464 465

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

466 467 468 469 470 471 472
/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three band
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
473
	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
474 475 476 477 478 479 480 481 482 483
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

484
static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
485
					     int band)
486
{
487
	return priv->q + band;
488 489
}

490 491
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
492
{
493
	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
494 495
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
496
		struct qdisc_skb_head *list = band2list(priv, band);
L
Linus Torvalds 已提交
497

498
		priv->bitmap |= (1 << band);
499
		qdisc->q.qlen++;
500
		return __qdisc_enqueue_tail(skb, qdisc, list);
501
	}
502

503
	return qdisc_drop(skb, qdisc, to_free);
L
Linus Torvalds 已提交
504 505
}

E
Eric Dumazet 已提交
506
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
507
{
508 509
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];
L
Linus Torvalds 已提交
510

511
	if (likely(band >= 0)) {
512 513
		struct qdisc_skb_head *qh = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qh);
514 515 516 517 518

		if (likely(skb != NULL)) {
			qdisc_qstats_backlog_dec(qdisc, skb);
			qdisc_bstats_update(qdisc, skb);
		}
519 520

		qdisc->q.qlen--;
521
		if (qh->qlen == 0)
522 523 524
			priv->bitmap &= ~(1 << band);

		return skb;
525
	}
526

L
Linus Torvalds 已提交
527 528 529
	return NULL;
}

E
Eric Dumazet 已提交
530
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
531
{
532 533 534 535
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
536
		struct qdisc_skb_head *qh = band2list(priv, band);
537

538
		return qh->head;
539 540 541 542 543
	}

	return NULL;
}

E
Eric Dumazet 已提交
544
static void pfifo_fast_reset(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
545
{
546
	int prio;
547
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
548 549

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
550
		__qdisc_reset_queue(band2list(priv, prio));
551

552
	priv->bitmap = 0;
553
	qdisc->qstats.backlog = 0;
554
	qdisc->q.qlen = 0;
L
Linus Torvalds 已提交
555 556
}

557 558 559 560
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

E
Eric Dumazet 已提交
561
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
562 563
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
564 565 566 567 568 569 570 571 572
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
573
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
574 575

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
576
		qdisc_skb_head_init(band2list(priv, prio));
577

578 579
	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
580 581 582
	return 0;
}

583
/* Operations table for the default pfifo_fast scheduler. */
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_fast_ops);
L
Linus Torvalds 已提交
595

596
static struct lock_class_key qdisc_tx_busylock;
597
static struct lock_class_key qdisc_running_key;
598

599
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
600
			  const struct Qdisc_ops *ops)
L
Linus Torvalds 已提交
601 602 603
{
	void *p;
	struct Qdisc *sch;
E
Eric Dumazet 已提交
604
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
605
	int err = -ENOBUFS;
606 607 608 609 610 611
	struct net_device *dev;

	if (!dev_queue) {
		err = -EINVAL;
		goto errout;
	}
L
Linus Torvalds 已提交
612

613
	dev = dev_queue->dev;
614 615 616
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

L
Linus Torvalds 已提交
617
	if (!p)
618 619
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
E
Eric Dumazet 已提交
620 621 622 623 624 625 626 627 628 629
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
630 631
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);
632

633
	spin_lock_init(&sch->busylock);
634 635 636
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

637 638 639 640
	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

L
Linus Torvalds 已提交
641 642 643
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
644
	sch->dev_queue = dev_queue;
645
	dev_hold(dev);
646
	refcount_set(&sch->refcnt, 1);
647 648 649

	return sch;
errout:
650
	return ERR_PTR(err);
651 652
}

653
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
654 655
				const struct Qdisc_ops *ops,
				unsigned int parentid)
656 657
{
	struct Qdisc *sch;
658

659
	if (!try_module_get(ops->owner))
660
		return NULL;
661

662
	sch = qdisc_alloc(dev_queue, ops);
663 664 665 666
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
667
	sch->parent = parentid;
668

L
Linus Torvalds 已提交
669 670 671
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

672
	qdisc_destroy(sch);
L
Linus Torvalds 已提交
673 674
	return NULL;
}
675
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
676

677
/* Under qdisc_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
678 679 680

void qdisc_reset(struct Qdisc *qdisc)
{
681
	const struct Qdisc_ops *ops = qdisc->ops;
L
Linus Torvalds 已提交
682 683 684

	if (ops->reset)
		ops->reset(qdisc);
685

686 687 688
	kfree_skb(qdisc->skb_bad_txq);
	qdisc->skb_bad_txq = NULL;

689
	if (qdisc->gso_skb) {
690
		kfree_skb_list(qdisc->gso_skb);
691 692
		qdisc->gso_skb = NULL;
	}
693
	qdisc->q.qlen = 0;
694
	qdisc->qstats.backlog = 0;
L
Linus Torvalds 已提交
695
}
696
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
697

698
static void qdisc_free(struct Qdisc *qdisc)
E
Eric Dumazet 已提交
699
{
700
	if (qdisc_is_percpu_stats(qdisc)) {
701
		free_percpu(qdisc->cpu_bstats);
702 703
		free_percpu(qdisc->cpu_qstats);
	}
704

E
Eric Dumazet 已提交
705 706 707
	kfree((char *) qdisc - qdisc->padded);
}

708
void qdisc_destroy(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
709
{
710 711
	const struct Qdisc_ops  *ops = qdisc->ops;

712
	if (qdisc->flags & TCQ_F_BUILTIN ||
713
	    !refcount_dec_and_test(&qdisc->refcnt))
714 715
		return;

716
#ifdef CONFIG_NET_SCHED
717
	qdisc_hash_del(qdisc);
718

E
Eric Dumazet 已提交
719
	qdisc_put_stab(rtnl_dereference(qdisc->stab));
720
#endif
721
	gen_kill_estimator(&qdisc->rate_est);
722 723 724 725 726 727 728 729
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

730
	kfree_skb_list(qdisc->gso_skb);
731
	kfree_skb(qdisc->skb_bad_txq);
732
	qdisc_free(qdisc);
L
Linus Torvalds 已提交
733
}
734
EXPORT_SYMBOL(qdisc_destroy);
L
Linus Torvalds 已提交
735

736 737 738 739 740 741 742 743 744 745 746
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
747
	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
748 749 750 751 752 753 754 755 756 757 758 759
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
760
EXPORT_SYMBOL(dev_graft_qdisc);
761

762 763 764 765
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
766 767
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;
768

769 770 771 772 773 774 775
	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
776
	}
777
	if (!netif_is_multiqueue(dev))
778
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
779 780 781
	dev_queue->qdisc_sleeping = qdisc;
}

782 783 784 785 786 787 788
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

789 790
	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
791 792
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
793
		qdisc_refcount_inc(dev->qdisc);
794
	} else {
795
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
796 797
		if (qdisc) {
			dev->qdisc = qdisc;
798
			qdisc->ops->attach(qdisc);
799 800
		}
	}
801
#ifdef CONFIG_NET_SCHED
802
	if (dev->qdisc != &noop_qdisc)
803
		qdisc_hash_add(dev->qdisc, false);
804
#endif
805 806
}

807 808 809 810
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
811
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
812 813
	int *need_watchdog_p = _need_watchdog;

814 815 816
	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

817
	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
818
	if (need_watchdog_p) {
819
		dev_queue->trans_start = 0;
820
		*need_watchdog_p = 1;
821
	}
822 823
}

L
Linus Torvalds 已提交
824 825
void dev_activate(struct net_device *dev)
{
826
	int need_watchdog;
827

L
Linus Torvalds 已提交
828
	/* No queueing discipline is attached to device;
829 830
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
L
Linus Torvalds 已提交
831 832
	 */

833 834
	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);
835

836 837 838 839
	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

840 841
	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
842 843
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
844 845

	if (need_watchdog) {
846
		netif_trans_update(dev);
L
Linus Torvalds 已提交
847 848
		dev_watchdog_up(dev);
	}
849
}
850
EXPORT_SYMBOL(dev_activate);
851

852 853 854
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
855
{
856
	struct Qdisc *qdisc_default = _qdisc_default;
857 858
	struct Qdisc *qdisc;

859
	qdisc = rtnl_dereference(dev_queue->qdisc);
860
	if (qdisc) {
861 862
		spin_lock_bh(qdisc_lock(qdisc));

863 864 865
		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

866
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
867
		qdisc_reset(qdisc);
868

869
		spin_unlock_bh(qdisc_lock(qdisc));
870
	}
L
Linus Torvalds 已提交
871 872
}

873
static bool some_qdisc_is_busy(struct net_device *dev)
874 875 876 877 878
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
879
		spinlock_t *root_lock;
880
		struct Qdisc *q;
881 882 883
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
884
		q = dev_queue->qdisc_sleeping;
885
		root_lock = qdisc_lock(q);
886

887
		spin_lock_bh(root_lock);
888

889
		val = (qdisc_is_running(q) ||
890
		       test_bit(__QDISC_STATE_SCHED, &q->state));
891

892
		spin_unlock_bh(root_lock);
893 894 895 896 897 898 899

		if (val)
			return true;
	}
	return false;
}

900 901 902 903 904 905 906
/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
907
void dev_deactivate_many(struct list_head *head)
L
Linus Torvalds 已提交
908
{
909
	struct net_device *dev;
910
	bool sync_needed = false;
911

912
	list_for_each_entry(dev, head, close_list) {
913 914 915 916 917 918 919
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
920
		sync_needed |= !dev->dismantle;
921
	}
L
Linus Torvalds 已提交
922

923 924 925 926 927 928
	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();
L
Linus Torvalds 已提交
929

930
	/* Wait for outstanding qdisc_run calls. */
931
	list_for_each_entry(dev, head, close_list)
932 933 934 935 936 937 938 939
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

940
	list_add(&dev->close_list, &single);
941
	dev_deactivate_many(&single);
942
	list_del(&single);
L
Linus Torvalds 已提交
943
}
944
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
945

946 947
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
948
				     void *_qdisc)
949
{
950 951
	struct Qdisc *qdisc = _qdisc;

952
	rcu_assign_pointer(dev_queue->qdisc, qdisc);
953 954 955
	dev_queue->qdisc_sleeping = qdisc;
}

L
Linus Torvalds 已提交
956 957
void dev_init_scheduler(struct net_device *dev)
{
958
	dev->qdisc = &noop_qdisc;
959
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
960 961
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
L
Linus Torvalds 已提交
962

963
	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
L
Linus Torvalds 已提交
964 965
}

966 967 968
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
L
Linus Torvalds 已提交
969
{
970
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
971
	struct Qdisc *qdisc_default = _qdisc_default;
972 973

	if (qdisc) {
974
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
975
		dev_queue->qdisc_sleeping = qdisc_default;
L
Linus Torvalds 已提交
976 977

		qdisc_destroy(qdisc);
978
	}
979 980 981 982
}

void dev_shutdown(struct net_device *dev)
{
983
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
984 985
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
986 987 988
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

989
	WARN_ON(timer_pending(&dev->watchdog_timer));
L
Linus Torvalds 已提交
990
}
991

992
/* Precompute the mult/shift pair used to turn a byte length into a
 * transmission time in nanoseconds for rate @conf (or the 64-bit
 * @rate64 override, whichever is larger).
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			/* stop before mult or factor would overflow */
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072

/* Empty RCU callback: used only as a grace-period marker below. */
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

/* Publish @tp_head as the active filter list by flipping between the
 * pair's two mini_Qdisc buffers, with RCU ensuring readers never see a
 * buffer that is being rewritten.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *active = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *spare;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		return;
	}

	/* Pick the buffer that is NOT currently visible to readers. */
	spare = !active || active == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	spare->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, spare);

	if (active)
		/* This is counterpart of the rcu barrier above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu_bh(&active->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

/* Wire both halves of the pair to the qdisc's per-cpu stats and record
 * where the active pointer lives.
 */
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);
EXPORT_SYMBOL(mini_qdisc_pair_init);