/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */
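
/* Illustrative sketch (not part of this file's API): per the rules above,
 * code that inspects or updates a qdisc's scheduling state from process
 * context is expected to follow a pattern like
 *
 *	spinlock_t *root_lock = qdisc_lock(q);
 *
 *	spin_lock_bh(root_lock);
 *	... read or update q->q.qlen, q->qstats, ...
 *	spin_unlock_bh(root_lock);
 *
 * while changes to the qdisc tree itself are made only under the rtnl
 * mutex.
 */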

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	q->gso_skb = skb;
	q->qstats.requeues++;
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *skb = q->gso_skb;

	if (unlikely(skb)) {
		struct net_device *dev = qdisc_dev(q);
		struct netdev_queue *txq;

		/* check the reason for requeuing without taking the tx lock first */
		txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
		if (!netif_tx_queue_stopped(txq) &&
		    !netif_tx_queue_frozen(txq)) {
			q->gso_skb = NULL;
			q->q.qlen--;
		} else
			skb = NULL;
	} else {
		skb = q->dequeue(q);
	}

	return skb;
}

static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU is already holding the lock: most likely a
		 * transient misconfiguration where hard_start_xmit()
		 * recursed. We detect it by checking the xmit lock owner
		 * and drop the packet to break the dead loop. Return OK
		 * so the caller tries the next skb.
		 */
		kfree_skb(skb);
		if (net_ratelimit())
			printk(KERN_WARNING "Dead loop on netdevice %s, "
			       "fix it urgently!\n", dev_queue->dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another CPU is holding the lock; requeue the skb and
		 * delay further xmits for some time.
		 */
		__get_cpu_var(netdev_rx_stat).cpu_collision++;
		ret = dev_requeue_skb(skb, q);
	}

	return ret;
}

/*
 * Transmit one skb, and handle the return status as required. Holding the
 * __QDISC_STATE_RUNNING bit guarantees that only one CPU can execute this
 * function.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	spin_unlock(root_lock);

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_tx_queue_stopped(txq) &&
	    !netif_tx_queue_frozen(txq))
		ret = dev_hard_start_xmit(skb, dev, txq);
	HARD_TX_UNLOCK(dev, txq);

	spin_lock(root_lock);

	switch (ret) {
	case NETDEV_TX_OK:
		/* Driver sent out skb successfully */
		ret = qdisc_qlen(q);
		break;

	case NETDEV_TX_LOCKED:
		/* Driver's trylock of the tx lock failed */
		ret = handle_dev_cpu_collision(skb, txq, q);
		break;

	default:
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
			printk(KERN_WARNING "BUG %s code %d qlen %d\n",
			       dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
		break;
	}

	if (ret && (netif_tx_queue_stopped(txq) ||
		    netif_tx_queue_frozen(txq)))
		ret = 0;

	return ret;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC_STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 * netif_tx_lock serializes accesses to the device driver.
 *
 * qdisc_lock(q) and netif_tx_lock are mutually exclusive:
 * if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;

	/* Dequeue packet */
	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;

	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	return sch_direct_xmit(skb, q, dev, txq, root_lock);
}

void __qdisc_run(struct Qdisc *q)
{
	unsigned long start_time = jiffies;

	while (qdisc_restart(q)) {
		/*
		 * Postpone processing if
		 * 1. another process needs the CPU;
		 * 2. we've been doing it for too long.
		 */
		if (need_resched() || jiffies != start_time) {
			__netif_schedule(q);
			break;
		}
	}

	clear_bit(__QDISC_STATE_RUNNING, &q->state);
}
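
/* For reference, the entry point that pairs with the clear_bit() above
 * lives in include/net/pkt_sched.h and, at this point in time, looks
 * roughly like:
 *
 *	static inline void qdisc_run(struct Qdisc *q)
 *	{
 *		if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
 *			__qdisc_run(q);
 *	}
 *
 * so at most one CPU services a given qdisc at a time.
 */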

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res = dev->trans_start;
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
	dev->trans_start = res;
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				/*
				 * old device drivers set dev->trans_start
				 */
				trans_start = txq->trans_start ? : dev->trans_start;
				if (netif_tx_queue_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					break;
				}
			}

			if (some_queue_timedout) {
				char drivername[64];
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev, drivername, 64), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
	.qdisc_sleeping	=	&noqueue_qdisc,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3
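
/* Worked example of the mapping above: an skb with skb->priority ==
 * TC_PRIO_INTERACTIVE (6) lands in band 0 (served first), one with
 * TC_PRIO_BESTEFFORT (0) in band 1, and one with TC_PRIO_BULK (2)
 * in band 2 (served last).
 */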

/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three bands
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct sk_buff_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
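
/* Worked example: skbs queued on bands 0 and 2 give bitmap == 0x5, and
 * bitmap2band[5] == 0, so band 0 is served first; once band 0 drains,
 * the bitmap becomes 0x4 and bitmap2band[4] == 2.
 */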

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return priv->q + band;
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct sk_buff_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct sk_buff_head *list = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);

		qdisc->q.qlen--;
		if (skb_queue_empty(list))
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc* qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct sk_buff_head *list = band2list(priv, band);

		return skb_peek(list);
	}

	return NULL;
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(band2list(priv, prio));

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;
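	/* Worked example, assuming QDISC_ALIGNTO == 32: if kzalloc()
	 * returns a pointer ending in 0x08, sch is rounded up to the
	 * next 0x20 boundary and ->padded records the 0x18-byte offset,
	 * which qdisc_destroy() uses to recover the original pointer.
	 */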

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(qdisc_dev(sch));
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

struct Qdisc * qdisc_create_dflt(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 struct Qdisc_ops *ops,
				 unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	if (qdisc->gso_skb) {
		kfree_skb(qdisc->gso_skb);
		qdisc->gso_skb = NULL;
		qdisc->q.qlen = 0;
	}
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_list_del(qdisc);

	qdisc_put_stab(qdisc->stab);
#endif
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb(qdisc->gso_skb);
	kfree((char *) qdisc - qdisc->padded);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev, dev_queue,
					  &pfifo_fast_ops, TC_H_ROOT);
		if (!qdisc) {
			printk(KERN_INFO "%s: activation failed\n", dev->name);
			return;
		}

		/* Can bypass the queue discipline for the default qdisc */
		qdisc->flags |= TCQ_F_CAN_BYPASS;
	} else {
		qdisc =  &noqueue_qdisc;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	struct netdev_queue *txq;
	int need_watchdog;

	/* No queueing discipline is attached to device;
	   create a default one, i.e. pfifo_fast for devices
	   which need queueing, and noqueue_qdisc for
	   virtual interfaces.
	 */

	if (dev->qdisc == &noop_qdisc) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);

		txq = netdev_get_tx_queue(dev, 0);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	}

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	transition_one_qdisc(dev, &dev->rx_queue, NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = dev_queue->qdisc;
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;
		root_lock = qdisc_lock(q);

		spin_lock_bh(root_lock);

		val = (test_bit(__QDISC_STATE_RUNNING, &q->state) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

void dev_deactivate(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);

	dev_watchdog_down(dev);

	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	while (some_qdisc_is_busy(dev))
		yield();
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	dev_queue->qdisc = qdisc;
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}