/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * queue->lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock queue->lock.
 * - ingress filtering is serialized via top level device
 *   spinlock dev->rx_queue.lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

void qdisc_lock_tree(struct net_device *dev)
	__acquires(dev->rx_queue.lock)
{
	unsigned int i;

	local_bh_disable();
	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		spin_lock(&txq->lock);
	}
	spin_lock(&dev->rx_queue.lock);
}
EXPORT_SYMBOL(qdisc_lock_tree);

void qdisc_unlock_tree(struct net_device *dev)
	__releases(dev->rx_queue.lock)
{
	unsigned int i;

	spin_unlock(&dev->rx_queue.lock);
	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
		spin_unlock(&txq->lock);
	}
	local_bh_enable();
}
EXPORT_SYMBOL(qdisc_unlock_tree);
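/*
 * Example (a sketch, mirroring dev_init_scheduler() below): any change to
 * the qdisc tree is bracketed by these helpers, so that every TX queue
 * lock and the ingress queue lock are held at once:
 *
 *	qdisc_lock_tree(dev);
 *	... swap dev_queue->qdisc / qdisc_sleeping pointers ...
 *	qdisc_unlock_tree(dev);
 */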

static inline int qdisc_qlen(struct Qdisc *q)
{
	return q->q.qlen;
}
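/*
 * A non-NULL skb->next means skb heads a partially-sent GSO segment
 * chain; it must not go back through ->requeue(), so it is parked in
 * q->gso_skb instead, and dequeue_skb() hands it back out before asking
 * the qdisc for a fresh packet.
 */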

static inline int dev_requeue_skb(struct sk_buff *skb,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *q)
{
	if (unlikely(skb->next))
		q->gso_skb = skb;
	else
		q->ops->requeue(skb, q);

	netif_schedule_queue(dev_queue);
	return 0;
}

static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *skb;

	if ((skb = q->gso_skb))
		q->gso_skb = NULL;
	else
		skb = q->dequeue(q);

	return skb;
}

static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
		kfree_skb(skb);
		if (net_ratelimit())
			printk(KERN_WARNING "Dead loop on netdevice %s, "
			       "fix it urgently!\n", dev_queue->dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
		__get_cpu_var(netdev_rx_stat).cpu_collision++;
		ret = dev_requeue_skb(skb, dev_queue, q);
	}

	return ret;
}

/*
 * NOTE: Called under queue->lock with locally disabled BH.
 *
 * __QDISC_STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. queue->lock serializes queue accesses for
 * this queue AND txq->qdisc pointer itself.
 *
 *  netif_tx_lock serializes accesses to the device driver.
 *
 *  queue->lock and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct netdev_queue *txq,
				struct Qdisc *q)
{
	int ret = NETDEV_TX_BUSY;
	struct net_device *dev;
	struct sk_buff *skb;

	/* Dequeue packet */
	if (unlikely((skb = dequeue_skb(q)) == NULL))
		return 0;

	/* And release queue */
	spin_unlock(&txq->lock);

	dev = txq->dev;

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_subqueue_stopped(dev, skb))
		ret = dev_hard_start_xmit(skb, dev, txq);
	HARD_TX_UNLOCK(dev, txq);

	spin_lock(&txq->lock);

	switch (ret) {
	case NETDEV_TX_OK:
		/* Driver sent out skb successfully */
		ret = qdisc_qlen(q);
		break;

	case NETDEV_TX_LOCKED:
		/* Driver try lock failed */
		ret = handle_dev_cpu_collision(skb, txq, q);
		break;

	default:
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY && net_ratelimit()))
			printk(KERN_WARNING "BUG %s code %d qlen %d\n",
			       dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, txq, q);
		break;
	}

	return ret;
}

void __qdisc_run(struct netdev_queue *txq)
{
	unsigned long start_time = jiffies;
	struct Qdisc *q = txq->qdisc;

	while (qdisc_restart(txq, q)) {
		if (netif_tx_queue_stopped(txq))
			break;

		/*
		 * Postpone processing if
		 * 1. another process needs the CPU;
		 * 2. we've been doing it for too long.
		 */
		if (need_resched() || jiffies != start_time) {
			netif_schedule_queue(txq);
			break;
		}
	}

	clear_bit(__QDISC_STATE_RUNNING, &q->state);
}
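/*
 * A sketch of the entry path (assuming the qdisc_run() wrapper in
 * include/net/pkt_sched.h looks roughly like this): __QDISC_STATE_RUNNING
 * is taken atomically, so only the winner enters __qdisc_run(), and a
 * concurrent caller simply leaves its packet queued for the owner to
 * drain:
 *
 *	static inline void qdisc_run(struct netdev_queue *txq)
 *	{
 *		struct Qdisc *q = txq->qdisc;
 *
 *		if (!test_and_set_bit(__QDISC_STATE_RUNNING, &q->state))
 *			__qdisc_run(txq);
 *	}
 */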

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_stopped = 0;
			unsigned int i;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				if (netif_tx_queue_stopped(txq)) {
					some_queue_stopped = 1;
					break;
				}
			}

			if (some_queue_stopped &&
			    time_after(jiffies, (dev->trans_start +
						 dev->watchdog_timeo))) {
				printk(KERN_INFO "NETDEV WATCHDOG: %s: "
				       "transmit timed out\n",
				       dev->name);
				dev->tx_timeout(dev);
				WARN_ON_ONCE(1);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}
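/*
 * Reference-counting convention for the watchdog: a pending
 * watchdog_timer holds one reference on dev. mod_timer() returns 0 when
 * the timer was not already pending, which is exactly when a reference
 * must be taken; dev_watchdog() drops its reference via dev_put() on
 * each run (re-arming takes a fresh one), and dev_watchdog_down() drops
 * it when it kills a still-pending timer.
 */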

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier is present.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
{
	return NULL;
}

static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	if (net_ratelimit())
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
	kfree_skb(skb);
	return NET_XMIT_CN;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.requeue	=	noop_requeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
};


static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 };
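/*
 * Worked example of the mapping above (band 0 is dequeued first):
 * skb->priority & TC_PRIO_MAX indexes prio2band, so TC_PRIO_BESTEFFORT
 * (0) lands in band 1, TC_PRIO_BULK (2) in band 2, and
 * TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) in band 0.
 */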

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	struct sk_buff_head *list = prio2list(skb, qdisc);

	if (skb_queue_len(list) < qdisc_dev(qdisc)->tx_queue_len) {
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}

	return NULL;
}

static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
{
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
}

static void pfifo_fast_reset(struct Qdisc* qdisc)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size;
	int err = -ENOBUFS;

	/* ensure that the Qdisc and the private data are 32-byte aligned */
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);

	p = kzalloc(size, GFP_KERNEL);
	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(qdisc_dev(sch));
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}
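/*
 * Worked example of the sizing above (QDISC_ALIGNTO is 32, and
 * QDISC_ALIGN() rounds up to the next 32-byte boundary): if kzalloc()
 * returns p at a 32-byte boundary plus 8, sch is advanced 24 bytes and
 * sch->padded records that shift, so __qdisc_destroy() can subtract it
 * again and kfree() the original pointer. The extra (QDISC_ALIGNTO - 1)
 * bytes in size guarantee there is always room for the shift.
 */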

struct Qdisc * qdisc_create_dflt(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 struct Qdisc_ops *ops,
				 unsigned int parentid)
{
	struct Qdisc *sch;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under queue->lock and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);
}
EXPORT_SYMBOL(qdisc_reset);

/* this is the rcu callback function to clean up a qdisc when there
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	kfree((char *) qdisc - qdisc->padded);
}

/* Under queue->lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

	list_del(&qdisc->list);
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}
EXPORT_SYMBOL(qdisc_destroy);

static bool dev_all_qdisc_sleeping_noop(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		if (txq->qdisc_sleeping != &noop_qdisc)
			return false;
	}
	return true;
}

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev, dev_queue,
					  &pfifo_fast_ops, TC_H_ROOT);
		if (!qdisc) {
			printk(KERN_INFO "%s: activation failed\n", dev->name);
			return;
		}
		list_add_tail(&qdisc->list, &dev_queue->qdisc_list);
	} else {
		qdisc =  &noqueue_qdisc;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	int *need_watchdog_p = _need_watchdog;

	spin_lock_bh(&dev_queue->lock);
	rcu_assign_pointer(dev_queue->qdisc, dev_queue->qdisc_sleeping);
	if (dev_queue->qdisc != &noqueue_qdisc)
		*need_watchdog_p = 1;
	spin_unlock_bh(&dev_queue->lock);
}
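/*
 * Note on the two qdisc pointers per TX queue: qdisc_sleeping holds the
 * configured qdisc, while qdisc is what the hot path actually uses.
 * transition_one_qdisc() above swaps the sleeping qdisc in under the
 * queue lock; dev_deactivate_queue() below swaps it back out for
 * noop_qdisc, so a downed device quietly drops packets instead of
 * queueing them.
 */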

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* If no queueing discipline is attached to the device, create a
	   default one: pfifo_fast for devices that need queueing, and
	   noqueue_qdisc for virtual interfaces.
	 */

	if (dev_all_qdisc_sleeping_noop(dev))
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct sk_buff *skb = NULL;
	struct Qdisc *qdisc;

	spin_lock_bh(&dev_queue->lock);

	qdisc = dev_queue->qdisc;
	if (qdisc) {
		dev_queue->qdisc = qdisc_default;
		qdisc_reset(qdisc);

		skb = qdisc->gso_skb;
		qdisc->gso_skb = NULL;
	}

	spin_unlock_bh(&dev_queue->lock);

	kfree_skb(skb);
}

static bool some_qdisc_is_running(struct net_device *dev, int lock)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc;

		if (lock)
			spin_lock_bh(&dev_queue->lock);

		val = test_bit(__QDISC_STATE_RUNNING, &q->state);

		if (lock)
			spin_unlock_bh(&dev_queue->lock);

		if (val)
			return true;
	}
	return false;
}

void dev_deactivate(struct net_device *dev)
{
	bool running;

	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);

	dev_watchdog_down(dev);

	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
	synchronize_rcu();

	/* Wait for outstanding qdisc_run calls. */
	do {
		while (some_qdisc_is_running(dev, 0))
			yield();

		/*
		 * Double-check inside queue lock to ensure that all effects
		 * of the queue run are visible when we return.
		 */
		running = some_qdisc_is_running(dev, 1);

		/*
		 * The running flag should never be set at this point because
		 * we've already set dev->qdisc to noop_qdisc *inside* the same
		 * pair of spin locks.  That is, if any qdisc_run starts after
		 * our initial test it should see the noop_qdisc and then
		 * clear the RUNNING bit before dropping the queue lock.  So
		 * if it is set here then we've found a bug.
		 */
	} while (WARN_ON_ONCE(running));
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	dev_queue->qdisc = qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	INIT_LIST_HEAD(&dev_queue->qdisc_list);
}

void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	dev_init_scheduler_queue(dev, &dev->rx_queue, NULL);
	qdisc_unlock_tree(dev);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		dev_queue->qdisc = qdisc_default;
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	qdisc_lock_tree(dev);
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	shutdown_scheduler_queue(dev, &dev->rx_queue, NULL);
	BUG_TRAP(!timer_pending(&dev->watchdog_timer));
	qdisc_unlock_tree(dev);
}