/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */
 */
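/* Illustration (added note, not part of the original file): a qdisc change
 * operation therefore typically serializes against enqueue/dequeue via the
 * root lock, e.g.
 *
 *	sch_tree_lock(sch);	(spin_lock_bh() on the root qdisc lock)
 *	... update scheduling data ...
 *	sch_tree_unlock(sch);
 *
 * while the enclosing tree modification itself runs under rtnl_lock().
 */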

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	skb_dst_force(skb);
	q->gso_skb = skb;
	q->qstats.requeues++;
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *skb = q->gso_skb;
	const struct netdev_queue *txq = q->dev_queue;

	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->gso_skb = NULL;
			q->q.qlen--;
		} else
			skb = NULL;
	} else {
		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) {
			skb = q->dequeue(q);
			if (skb)
				skb = validate_xmit_skb(skb, qdisc_dev(q));
		}
	}

	return skb;
}

static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
		kfree_skb_list(skb);
		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
				     dev_queue->dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
		__this_cpu_inc(softnet_data.cpu_collision);
		ret = dev_requeue_skb(skb, q);
	}

	return ret;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	spin_unlock(root_lock);

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_stopped(txq))
		skb = dev_hard_start_xmit(skb, dev, txq, &ret);

	HARD_TX_UNLOCK(dev, txq);

	spin_lock(root_lock);

	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
		ret = qdisc_qlen(q);
	} else if (ret == NETDEV_TX_LOCKED) {
		/* Driver try lock failed */
		ret = handle_dev_cpu_collision(skb, txq, q);
	} else {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		ret = 0;

	return ret;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC___STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;

	/* Dequeue packet */
	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;

	WARN_ON_ONCE(skb_dst_is_noref(skb));

	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock);
}

void __qdisc_run(struct Qdisc *q)
{
	int quota = weight_p;

	while (qdisc_restart(q)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
		 */
		if (--quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}

	qdisc_run_end(q);
}

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	res = dev->trans_start;
	for (i = 0; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
	dev->trans_start = res;

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				/*
				 * old device drivers set dev->trans_start
				 */
				trans_start = txq->trans_start ? : dev->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
	.qdisc_sleeping	=	&noqueue_qdisc,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
};


static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three band
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct sk_buff_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return priv->q + band;
}

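/* Illustrative note (added here, not in the original source): a packet sent
 * with skb->priority == TC_PRIO_INTERACTIVE (6) maps through prio2band[6] == 0
 * into band 0, the highest-priority band, and enqueueing it sets bit 0 of
 * priv->bitmap.  If bands 0 and 2 both hold packets, bitmap == 5 and
 * bitmap2band[5] == 0, so dequeue drains band 0 first.
 */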
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct sk_buff_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct sk_buff_head *list = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);

		qdisc->q.qlen--;
		if (skb_queue_empty(list))
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct sk_buff_head *list = band2list(priv, band);

		return skb_peek(list);
	}

	return NULL;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__skb_queue_head_init(band2list(priv, prio));

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

static struct lock_class_key qdisc_tx_busylock;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev = dev_queue->dev;

	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner))
		goto errout;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	if (qdisc->gso_skb) {
		kfree_skb_list(qdisc->gso_skb);
		qdisc->gso_skb = NULL;
		qdisc->q.qlen = 0;
	}
}
EXPORT_SYMBOL(qdisc_reset);

static void qdisc_rcu_free(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);

	if (qdisc_is_percpu_stats(qdisc))
		free_percpu(qdisc->cpu_bstats);

	kfree((char *) qdisc - qdisc->padded);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_list_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb_list(qdisc->gso_skb);
	/*
	 * gen_estimator est_timer() might access qdisc->q.lock,
	 * wait a RCU grace period before freeing qdisc.
	 */
	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc = &noqueue_qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev_queue,
					  default_qdisc_ops, TC_H_ROOT);
		if (!qdisc) {
			netdev_info(dev, "activation failed\n");
			return;
		}
		if (!netif_is_multiqueue(dev))
			qdisc->flags |= TCQ_F_ONETXQUEUE;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;
		root_lock = qdisc_lock(q);

		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;
	bool sync_needed = false;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
		sync_needed |= !dev->dismantle;
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
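	/* Worked example (added for clarity, not part of the original comment):
	 * for rate_bytes_ps = 125000 (1 Mbit/s) the loop below stops at
	 * shift = 19 and mult = 8000 << 19 = 4194304000, the first value with
	 * bit 31 set.  A 1500 byte packet then costs
	 * (1500 * 4194304000) >> 19 = 12000000 ns = 12 ms, which matches
	 * 1500 / 125000 seconds exactly.
	 */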
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);