/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */
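
/*
 * sch_direct_xmit() below shows the root lock rule in action: the lock is
 * dropped only around the driver call and retaken before the qdisc is
 * touched again.
 */
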
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	q->gso_skb = skb;
	q->qstats.requeues++;
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}
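
/*
 * The skb parked in q->gso_skb above is picked up again by dequeue_skb()
 * below, ahead of anything the qdisc itself still holds.
 */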

static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *skb = q->gso_skb;

	if (unlikely(skb)) {
		struct net_device *dev = qdisc_dev(q);
		struct netdev_queue *txq;

		/* check why the skb was requeued, without taking the tx lock */
		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
		if (!netif_tx_queue_stopped(txq) &&
		    !netif_tx_queue_frozen(txq)) {
			q->gso_skb = NULL;
			q->q.qlen--;
		} else
			skb = NULL;
	} else {
		skb = q->dequeue(q);
	}

	return skb;
}
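
/*
 * handle_dev_cpu_collision() runs when the driver returned
 * NETDEV_TX_LOCKED, i.e. (typically) a lockless-transmit (NETIF_F_LLTX)
 * driver found its private tx lock already taken.
 */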

static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
		kfree_skb(skb);
		if (net_ratelimit())
			printk(KERN_WARNING "Dead loop on netdevice %s, "
			       "fix it urgently!\n", dev_queue->dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another CPU is holding the lock; requeue the skb and
		 * delay xmits for some time.
		 */
		__get_cpu_var(netdev_rx_stat).cpu_collision++;
		ret = dev_requeue_skb(skb, q);
	}

	return ret;
}

/*
 * Transmit one skb, and handle the return status as required. Holding the
 * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
 * function.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	spin_unlock(root_lock);

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
		ret = dev_hard_start_xmit(skb, dev, txq);

	HARD_TX_UNLOCK(dev, txq);

	spin_lock(root_lock);

	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
		ret = qdisc_qlen(q);
	} else if (ret == NETDEV_TX_LOCKED) {
		/* Driver's try-lock failed */
		ret = handle_dev_cpu_collision(skb, txq, q);
	} else {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
			printk(KERN_WARNING "BUG %s code %d qlen %d\n",
			       dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
	}

	if (ret && (netif_tx_queue_stopped(txq) ||
		    netif_tx_queue_frozen(txq)))
		ret = 0;

	return ret;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC_STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 * netif_tx_lock serializes accesses to the device driver.
 *
 * qdisc_lock(q) and netif_tx_lock are mutually exclusive:
 * if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;

	/* Dequeue packet */
	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;

	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	return sch_direct_xmit(skb, q, dev, txq, root_lock);
}
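
/*
 * Note: a zero return from qdisc_restart() stops the __qdisc_run() loop
 * below, while a positive queue length lets it keep pulling packets.
 */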

void __qdisc_run(struct Qdisc *q)
{
	unsigned long start_time = jiffies;

	while (qdisc_restart(q)) {
		/*
		 * Postpone processing if
		 * 1. another process needs the CPU;
		 * 2. we've been doing it for too long.
		 */
		if (need_resched() || jiffies != start_time) {
			__netif_schedule(q);
			break;
		}
	}

	clear_bit(__QDISC_STATE_RUNNING, &q->state);
}
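
/*
 * A sketch of the usual entry point (assuming the qdisc_run() helper of
 * this kernel era, in include/net/pkt_sched.h): the caller takes the
 * __QDISC_STATE_RUNNING bit before entering __qdisc_run():
 *
 *	static inline void qdisc_run(struct Qdisc *q)
 *	{
 *		if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
 *			__qdisc_run(q);
 *	}
 */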

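/*
 * Return the most recent trans_start of any tx queue, and cache it in
 * dev->trans_start for old drivers that still read it directly.
 */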
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res = dev->trans_start;
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
	dev->trans_start = res;
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				/*
				 * old device drivers set dev->trans_start
				 */
				trans_start = txq->trans_start ? : dev->trans_start;
				if (netif_tx_queue_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					break;
				}
			}

			if (some_queue_timedout) {
				char drivername[64];
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
					  dev->name, netdev_drivername(dev, drivername, 64), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
	.qdisc_sleeping	=	&noqueue_qdisc,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };
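/*
 * For example: skb->priority TC_PRIO_BESTEFFORT (0) maps to band 1,
 * TC_PRIO_BULK (2) to band 2, and TC_PRIO_INTERACTIVE (6) and
 * TC_PRIO_CONTROL (7) to band 0, the band served first.
 */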

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three bands
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct sk_buff_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
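
/*
 * For example: skbs on bands 1 and 2 give bitmap = binary 110 = 6, and
 * bitmap2band[6] = 1, so the lowest-numbered (highest priority)
 * non-empty band is served first.
 */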

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return priv->q + band;
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct sk_buff_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct sk_buff_head *list = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);

		qdisc->q.qlen--;
		if (skb_queue_empty(list))
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct sk_buff_head *list = band2list(priv, band);

		return skb_peek(list);
	}

	return NULL;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}
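
/*
 * This dump is what "tc qdisc show" renders for the default qdisc,
 * typically (exact output varies by iproute2 version) something like:
 *
 *	qdisc pfifo_fast 0: bands 3 priomap 1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 1
 */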

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(band2list(priv, prio));

	return 0;
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(qdisc_dev(sch));
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct net_device *dev,
				struct netdev_queue *dev_queue,
				struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	if (qdisc->gso_skb) {
		kfree_skb(qdisc->gso_skb);
		qdisc->gso_skb = NULL;
		qdisc->q.qlen = 0;
	}
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_list_del(qdisc);

	qdisc_put_stab(qdisc->stab);
#endif
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb(qdisc->gso_skb);
	kfree((char *) qdisc - qdisc->padded);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev, dev_queue,
					  &pfifo_fast_ops, TC_H_ROOT);
		if (!qdisc) {
			printk(KERN_INFO "%s: activation failed\n", dev->name);
			return;
		}

		/* Can by-pass the queue discipline for default qdisc */
		qdisc->flags |= TCQ_F_CAN_BYPASS;
	} else {
		qdisc = &noqueue_qdisc;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {
		qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			qdisc->ops->attach(qdisc);
			dev->qdisc = qdisc;
		}
	}
}
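
/*
 * For multiqueue devices the "mq" qdisc is grafted as root instead; its
 * ->attach() is expected to install a per-tx-queue child (pfifo_fast by
 * default), so each hardware queue keeps its own qdisc and lock.
 */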

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to the device;
	   create a default one, i.e. pfifo_fast for devices
	   that need queueing and noqueue_qdisc for virtual
	   interfaces.
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	transition_one_qdisc(dev, &dev->rx_queue, NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = dev_queue->qdisc;
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;
		root_lock = qdisc_lock(q);

		spin_lock_bh(root_lock);

		val = (test_bit(__QDISC_STATE_RUNNING, &q->state) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

void dev_deactivate(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);

	dev_watchdog_down(dev);

	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (some_qdisc_is_busy(dev))
		yield();
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	dev_queue->qdisc = qdisc;
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}