sch_generic.c 17.3 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

31
/* Modifications to data participating in scheduling must be protected with
 * qdisc_root_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

40 41 42 43 44
static inline int qdisc_qlen(struct Qdisc *q)
{
	return q->q.qlen;
}

45
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
46 47
{
	if (unlikely(skb->next))
48
		q->gso_skb = skb;
49 50
	else
		q->ops->requeue(skb, q);
51

52
	__netif_schedule(q);
53 54 55
	return 0;
}

56
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
57
{
58
	struct sk_buff *skb;
59

60 61
	if ((skb = q->gso_skb))
		q->gso_skb = NULL;
62 63 64 65 66 67
	else
		skb = q->dequeue(q);

	return skb;
}

68
static inline int handle_dev_cpu_collision(struct sk_buff *skb,
69
					   struct netdev_queue *dev_queue,
70
					   struct Qdisc *q)
71
{
72
	int ret;
73

74
	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
75 76 77 78 79 80
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
81
		kfree_skb(skb);
82 83
		if (net_ratelimit())
			printk(KERN_WARNING "Dead loop on netdevice %s, "
84
			       "fix it urgently!\n", dev_queue->dev->name);
85 86 87 88 89 90 91
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
		__get_cpu_var(netdev_rx_stat).cpu_collision++;
92
		ret = dev_requeue_skb(skb, q);
93 94
	}

95
	return ret;
96 97
}

98
/*
99
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
100
 *
101
 * __QDISC_STATE_RUNNING guarantees only one CPU can process
102 103
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
104 105 106
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
107
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
108 109 110 111 112 113 114 115 116
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
117
static inline int qdisc_restart(struct Qdisc *q)
L
Linus Torvalds 已提交
118
{
119
	struct netdev_queue *txq;
120
	int ret = NETDEV_TX_BUSY;
121
	struct net_device *dev;
122
	spinlock_t *root_lock;
123
	struct sk_buff *skb;
L
Linus Torvalds 已提交
124

125
	/* Dequeue packet */
126
	if (unlikely((skb = dequeue_skb(q)) == NULL))
127
		return 0;
128

129 130 131 132
	root_lock = qdisc_root_lock(q);

	/* And release qdisc */
	spin_unlock(root_lock);
133

134 135
	dev = qdisc_dev(q);
	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));
136

137
	HARD_TX_LOCK(dev, txq, smp_processor_id());
138 139
	if (!netif_tx_queue_stopped(txq) &&
	    !netif_tx_queue_frozen(txq))
140
		ret = dev_hard_start_xmit(skb, dev, txq);
141
	HARD_TX_UNLOCK(dev, txq);
142

143
	spin_lock(root_lock);
144

145 146 147 148 149 150 151 152
	switch (ret) {
	case NETDEV_TX_OK:
		/* Driver sent out skb successfully */
		ret = qdisc_qlen(q);
		break;

	case NETDEV_TX_LOCKED:
		/* Driver try lock failed */
153
		ret = handle_dev_cpu_collision(skb, txq, q);
154 155 156 157 158 159 160 161
		break;

	default:
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
			printk(KERN_WARNING "BUG %s code %d qlen %d\n",
			       dev->name, ret, q->q.qlen);

162
		ret = dev_requeue_skb(skb, q);
163 164
		break;
	}
165

166 167
	if (ret && (netif_tx_queue_stopped(txq) ||
		    netif_tx_queue_frozen(txq)))
168 169
		ret = 0;

170
	return ret;
L
Linus Torvalds 已提交
171 172
}

173
void __qdisc_run(struct Qdisc *q)
H
Herbert Xu 已提交
174
{
175 176
	unsigned long start_time = jiffies;

177
	while (qdisc_restart(q)) {
178 179 180 181 182 183
		/*
		 * Postpone processing if
		 * 1. another process needs the CPU;
		 * 2. we've been doing it for too long.
		 */
		if (need_resched() || jiffies != start_time) {
184
			__netif_schedule(q);
185
			break;
186 187
		}
	}
H
Herbert Xu 已提交
188

189
	clear_bit(__QDISC_STATE_RUNNING, &q->state);
H
Herbert Xu 已提交
190 191
}

L
Linus Torvalds 已提交
192 193 194 195
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

H
Herbert Xu 已提交
196
	netif_tx_lock(dev);
197
	if (!qdisc_tx_is_noop(dev)) {
L
Linus Torvalds 已提交
198 199 200
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
201 202 203 204 205 206 207 208 209 210 211 212
			int some_queue_stopped = 0;
			unsigned int i;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				if (netif_tx_queue_stopped(txq)) {
					some_queue_stopped = 1;
					break;
				}
			}
213

214 215 216
			if (some_queue_stopped &&
			    time_after(jiffies, (dev->trans_start +
						 dev->watchdog_timeo))) {
217 218 219
				char drivername[64];
				printk(KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit timed out\n",
				       dev->name, netdev_drivername(dev, drivername, 64));
L
Linus Torvalds 已提交
220
				dev->tx_timeout(dev);
221
				WARN_ON_ONCE(1);
L
Linus Torvalds 已提交
222
			}
223 224 225
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
L
Linus Torvalds 已提交
226 227 228
				dev_hold(dev);
		}
	}
H
Herbert Xu 已提交
229
	netif_tx_unlock(dev);
L
Linus Torvalds 已提交
230 231 232 233 234 235 236 237 238

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
239 240
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
L
Linus Torvalds 已提交
241 242 243 244 245 246 247 248 249 250 251
			dev_hold(dev);
	}
}

/* File-local wrapper: start the tx watchdog via __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
H
Herbert Xu 已提交
252
	netif_tx_lock_bh(dev);
L
Linus Torvalds 已提交
253
	if (del_timer(&dev->watchdog_timer))
254
		dev_put(dev);
H
Herbert Xu 已提交
255
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
256 257
}

258 259 260 261 262 263
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
264 265
void netif_carrier_on(struct net_device *dev)
{
J
Jeff Garzik 已提交
266
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
267
		linkwatch_fire_event(dev);
J
Jeff Garzik 已提交
268 269 270
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
271
}
272
EXPORT_SYMBOL(netif_carrier_on);
273

274 275 276 277 278 279
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
280 281 282 283 284
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state))
		linkwatch_fire_event(dev);
}
285
EXPORT_SYMBOL(netif_carrier_off);
286

L
Linus Torvalds 已提交
287 288 289 290 291
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

292
static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
L
Linus Torvalds 已提交
293 294 295 296 297
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

298
static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
L
Linus Torvalds 已提交
299 300 301 302
{
	return NULL;
}

303
static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
L
Linus Torvalds 已提交
304 305
{
	if (net_ratelimit())
306 307
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
L
Linus Torvalds 已提交
308 309 310 311
	kfree_skb(skb);
	return NET_XMIT_CN;
}

312
/* Operations table of the "noop" scheduler; it carries no private data. */
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		= "noop",
	.priv_size	= 0,
	.enqueue	= noop_enqueue,
	.dequeue	= noop_dequeue,
	.requeue	= noop_requeue,
	.owner		= THIS_MODULE,
};

321 322 323 324
/* Static netdev_queue that the built-in noop_qdisc points at via ->dev_queue. */
static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
};

L
Linus Torvalds 已提交
325 326 327 328
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
329
	.ops		=	&noop_qdisc_ops,
L
Linus Torvalds 已提交
330
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
331
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
332
	.dev_queue	=	&noop_netdev_queue,
L
Linus Torvalds 已提交
333
};
334
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
335

336
/* Operations table of the "noqueue" scheduler; it reuses the noop hooks. */
static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		= "noqueue",
	.priv_size	= 0,
	.enqueue	= noop_enqueue,
	.dequeue	= noop_dequeue,
	.requeue	= noop_requeue,
	.owner		= THIS_MODULE,
};

345 346 347 348 349
/* Tentative definition so noqueue_netdev_queue can take its address. */
static struct Qdisc noqueue_qdisc;
/* Static netdev_queue that noqueue_qdisc points at via ->dev_queue. */
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
};

L
Linus Torvalds 已提交
350 351 352 353 354 355
static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
356 357
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
L
Linus Torvalds 已提交
358 359 360
};


361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377
/*
 * Band index for each of the TC_PRIO_MAX+1 priority values; looked up
 * with (skb->priority & TC_PRIO_MAX) in prio2list().
 */
static const u8 prio2band[TC_PRIO_MAX+1] =
	{ 1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1 };

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
378
{
379
	struct sk_buff_head *list = prio2list(skb, qdisc);
L
Linus Torvalds 已提交
380

381 382
	if (skb_queue_len(list) < qdisc_dev(qdisc)->tx_queue_len) {
		qdisc->q.qlen++;
383
		return __qdisc_enqueue_tail(skb, qdisc, list);
384
	}
385 386

	return qdisc_drop(skb, qdisc);
L
Linus Torvalds 已提交
387 388
}

389
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
L
Linus Torvalds 已提交
390
{
391 392
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);
L
Linus Torvalds 已提交
393

394 395 396 397 398 399
	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
			qdisc->q.qlen--;
			return __qdisc_dequeue_head(qdisc, list + prio);
		}
	}
400

L
Linus Torvalds 已提交
401 402 403
	return NULL;
}

404
static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
L
Linus Torvalds 已提交
405
{
406 407
	qdisc->q.qlen++;
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
L
Linus Torvalds 已提交
408 409
}

410
static void pfifo_fast_reset(struct Qdisc* qdisc)
L
Linus Torvalds 已提交
411
{
412 413 414 415 416 417
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, list + prio);

418
	qdisc->qstats.backlog = 0;
419
	qdisc->q.qlen = 0;
L
Linus Torvalds 已提交
420 421
}

422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453
/*
 * Dump the qdisc configuration into @skb as a TCA_OPTIONS attribute
 * holding a struct tc_prio_qopt (band count plus the prio2band map).
 *
 * Returns skb->len on success, or -1 when the NLA_PUT() macro jumps to
 * nla_put_failure.
 */
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	/* prio2band[] has TC_PRIO_MAX+1 one-byte entries */
	memcpy(&opt.priomap, prio2band, sizeof(prio2band));

	NLA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);

	return skb->len;

nla_put_failure:
	return -1;
}

/*
 * Initialise the PFIFO_FAST_BANDS queue heads kept in the qdisc private
 * area.  @opt is not used.  Always returns 0.
 */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	struct sk_buff_head *bands = qdisc_priv(qdisc);
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS; band++)
		skb_queue_head_init(&bands[band]);

	return 0;
}

/*
 * Operations table of the "pfifo_fast" scheduler.  The private area
 * holds one sk_buff_head per band.
 */
static struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		= "pfifo_fast",
	.priv_size	= PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
	.enqueue	= pfifo_fast_enqueue,
	.dequeue	= pfifo_fast_dequeue,
	.requeue	= pfifo_fast_requeue,
	.init		= pfifo_fast_init,
	.reset		= pfifo_fast_reset,
	.dump		= pfifo_fast_dump,
	.owner		= THIS_MODULE,
};

457
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
458
			  struct Qdisc_ops *ops)
L
Linus Torvalds 已提交
459 460 461
{
	void *p;
	struct Qdisc *sch;
462 463
	unsigned int size;
	int err = -ENOBUFS;
L
Linus Torvalds 已提交
464 465

	/* ensure that the Qdisc and the private data are 32-byte aligned */
466 467
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);
L
Linus Torvalds 已提交
468

469
	p = kzalloc(size, GFP_KERNEL);
L
Linus Torvalds 已提交
470
	if (!p)
471 472 473
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;
L
Linus Torvalds 已提交
474 475 476 477 478 479

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
480
	sch->dev_queue = dev_queue;
481
	dev_hold(qdisc_dev(sch));
L
Linus Torvalds 已提交
482
	atomic_set(&sch->refcnt, 1);
483 484 485

	return sch;
errout:
486
	return ERR_PTR(err);
487 488
}

489 490 491
struct Qdisc * qdisc_create_dflt(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 struct Qdisc_ops *ops,
492
				 unsigned int parentid)
493 494
{
	struct Qdisc *sch;
495

496
	sch = qdisc_alloc(dev_queue, ops);
497 498
	if (IS_ERR(sch))
		goto errout;
499
	sch->parent = parentid;
500

L
Linus Torvalds 已提交
501 502 503
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

504
	qdisc_destroy(sch);
505
errout:
L
Linus Torvalds 已提交
506 507
	return NULL;
}
508
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
509

510
/* Under qdisc_root_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
511 512 513

void qdisc_reset(struct Qdisc *qdisc)
{
514
	const struct Qdisc_ops *ops = qdisc->ops;
L
Linus Torvalds 已提交
515 516 517 518

	if (ops->reset)
		ops->reset(qdisc);
}
519
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
520

521
/* this is the rcu callback function to clean up a qdisc when there
L
Linus Torvalds 已提交
522 523 524 525 526
 * are no further references to it */

static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
527 528
	const struct Qdisc_ops  *ops = qdisc->ops;

529
#ifdef CONFIG_NET_SCHED
530
	qdisc_put_stab(qdisc->stab);
531
#endif
532 533 534 535 536 537 538 539 540
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

541 542
	kfree_skb(qdisc->gso_skb);

L
Linus Torvalds 已提交
543 544 545
	kfree((char *) qdisc - qdisc->padded);
}

546
/* Under qdisc_root_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
547 548 549 550

void qdisc_destroy(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
551
	    !atomic_dec_and_test(&qdisc->refcnt))
L
Linus Torvalds 已提交
552 553
		return;

554 555
	if (qdisc->parent)
		list_del(&qdisc->list);
L
Linus Torvalds 已提交
556 557 558

	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}
559
EXPORT_SYMBOL(qdisc_destroy);
L
Linus Torvalds 已提交
560

561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581
static bool dev_all_qdisc_sleeping_noop(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *txq = netdev_get_tx_queue(dev, i);

		if (txq->qdisc_sleeping != &noop_qdisc)
			return false;
	}
	return true;
}

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev, dev_queue,
582
					  &pfifo_fast_ops, TC_H_ROOT);
583 584 585 586 587 588 589 590 591 592 593 594 595 596
		if (!qdisc) {
			printk(KERN_INFO "%s: activation failed\n", dev->name);
			return;
		}
	} else {
		qdisc =  &noqueue_qdisc;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
597
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
598 599
	int *need_watchdog_p = _need_watchdog;

600
	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
601
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc)
602 603 604
		*need_watchdog_p = 1;
}

L
Linus Torvalds 已提交
605 606
void dev_activate(struct net_device *dev)
{
607
	int need_watchdog;
608

L
Linus Torvalds 已提交
609
	/* No queueing discipline is attached to device;
610 611 612
	   create default one i.e. pfifo_fast for devices,
	   which need queueing and noqueue_qdisc for
	   virtual interfaces
L
Linus Torvalds 已提交
613 614
	 */

615 616
	if (dev_all_qdisc_sleeping_noop(dev))
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
L
Linus Torvalds 已提交
617

618 619 620 621
	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

622 623
	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
624
	transition_one_qdisc(dev, &dev->rx_queue, NULL);
625 626

	if (need_watchdog) {
L
Linus Torvalds 已提交
627 628 629
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
630 631
}

632 633 634
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
635
{
636
	struct Qdisc *qdisc_default = _qdisc_default;
637 638 639
	struct Qdisc *qdisc;

	qdisc = dev_queue->qdisc;
640
	if (qdisc) {
641 642
		spin_lock_bh(qdisc_lock(qdisc));

643 644
		dev_queue->qdisc = qdisc_default;
		qdisc_reset(qdisc);
645

646
		spin_unlock_bh(qdisc_lock(qdisc));
647
	}
L
Linus Torvalds 已提交
648 649
}

650 651 652 653 654 655
static bool some_qdisc_is_running(struct net_device *dev, int lock)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
656
		spinlock_t *root_lock;
657
		struct Qdisc *q;
658 659 660
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
661
		q = dev_queue->qdisc;
662
		root_lock = qdisc_root_lock(q);
663 664

		if (lock)
665
			spin_lock_bh(root_lock);
666

667
		val = test_bit(__QDISC_STATE_RUNNING, &q->state);
668 669

		if (lock)
670
			spin_unlock_bh(root_lock);
671 672 673 674 675 676 677

		if (val)
			return true;
	}
	return false;
}

L
Linus Torvalds 已提交
678 679
void dev_deactivate(struct net_device *dev)
{
680
	bool running;
L
Linus Torvalds 已提交
681

682
	netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc);
683
	dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc);
684

L
Linus Torvalds 已提交
685 686
	dev_watchdog_down(dev);

687
	/* Wait for outstanding qdisc-less dev_queue_xmit calls. */
688
	synchronize_rcu();
L
Linus Torvalds 已提交
689

690
	/* Wait for outstanding qdisc_run calls. */
691
	do {
692
		while (some_qdisc_is_running(dev, 0))
693 694 695 696 697 698
			yield();

		/*
		 * Double-check inside queue lock to ensure that all effects
		 * of the queue run are visible when we return.
		 */
699
		running = some_qdisc_is_running(dev, 1);
700 701 702 703 704 705 706 707 708 709

		/*
		 * The running flag should never be set at this point because
		 * we've already set dev->qdisc to noop_qdisc *inside* the same
		 * pair of spin locks.  That is, if any qdisc_run starts after
		 * our initial test it should see the noop_qdisc and then
		 * clear the RUNNING bit before dropping the queue lock.  So
		 * if it is set here then we've found a bug.
		 */
	} while (WARN_ON_ONCE(running));
L
Linus Torvalds 已提交
710 711
}

712 713
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
714
				     void *_qdisc)
715
{
716 717
	struct Qdisc *qdisc = _qdisc;

718 719 720 721
	dev_queue->qdisc = qdisc;
	dev_queue->qdisc_sleeping = qdisc;
}

L
Linus Torvalds 已提交
722 723
void dev_init_scheduler(struct net_device *dev)
{
724
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
725
	dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
L
Linus Torvalds 已提交
726

727
	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
L
Linus Torvalds 已提交
728 729
}

730 731 732
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
L
Linus Torvalds 已提交
733
{
734
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
735
	struct Qdisc *qdisc_default = _qdisc_default;
736 737

	if (qdisc) {
738 739
		spinlock_t *root_lock = qdisc_root_lock(qdisc);

740 741
		dev_queue->qdisc = qdisc_default;
		dev_queue->qdisc_sleeping = qdisc_default;
L
Linus Torvalds 已提交
742

743
		spin_lock_bh(root_lock);
L
Linus Torvalds 已提交
744
		qdisc_destroy(qdisc);
745
		spin_unlock_bh(root_lock);
746
	}
747 748 749 750
}

void dev_shutdown(struct net_device *dev)
{
751
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
752
	shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
753
	WARN_ON(timer_pending(&dev->watchdog_timer));
L
Linus Torvalds 已提交
754
}