/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
27
#include <linux/slab.h>
28
#include <linux/if_vlan.h>
29
#include <net/sch_generic.h>
L
Linus Torvalds 已提交
30
#include <net/pkt_sched.h>
E
Eric Dumazet 已提交
31
#include <net/dst.h>
L
Linus Torvalds 已提交
32

33 34 35 36
/*
 * Qdisc to use by default: the ops handed to qdisc_create_dflt() when a
 * default qdisc is attached to a tx queue of a device whose tx_queue_len
 * is non-zero. Initially points at pfifo_fast.
 */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

L
Linus Torvalds 已提交
37 38
/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

48
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
49
{
E
Eric Dumazet 已提交
50
	skb_dst_force(skb);
51
	q->gso_skb = skb;
52
	q->qstats.requeues++;
53
	q->q.qlen++;	/* it's still part of the queue */
54
	__netif_schedule(q);
55

56 57 58
	return 0;
}

59
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
60
{
61
	struct sk_buff *skb = q->gso_skb;
62
	const struct netdev_queue *txq = q->dev_queue;
63

64 65
	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
66
		txq = skb_get_tx_queue(txq->dev, skb);
67
		if (!netif_xmit_frozen_or_stopped(txq)) {
68
			q->gso_skb = NULL;
69 70
			q->q.qlen--;
		} else
71 72
			skb = NULL;
	} else {
73
		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq)) {
74
			skb = q->dequeue(q);
75 76 77
			if (skb)
				skb = validate_xmit_skb(skb, qdisc_dev(q));
		}
78
	}
79 80 81 82

	return skb;
}

83
static inline int handle_dev_cpu_collision(struct sk_buff *skb,
84
					   struct netdev_queue *dev_queue,
85
					   struct Qdisc *q)
86
{
87
	int ret;
88

89
	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
90 91 92 93 94 95
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
96
		kfree_skb_list(skb);
97 98
		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
				     dev_queue->dev->name);
99 100 101 102 103 104
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
E
Eric Dumazet 已提交
105
		__this_cpu_inc(softnet_data.cpu_collision);
106
		ret = dev_requeue_skb(skb, q);
107 108
	}

109
	return ret;
110 111
}

112
/*
113 114 115
 * Transmit possibly several skbs, and handle the return status as
 * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
 * only one CPU can execute this function.
116 117 118 119 120
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
121 122 123
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock)
L
Linus Torvalds 已提交
124
{
125
	int ret = NETDEV_TX_BUSY;
126 127 128

	/* And release qdisc */
	spin_unlock(root_lock);
129

130
	HARD_TX_LOCK(dev, txq, smp_processor_id());
131
	if (!netif_xmit_frozen_or_stopped(txq))
132
		skb = dev_hard_start_xmit(skb, dev, txq, &ret);
133

134
	HARD_TX_UNLOCK(dev, txq);
135

136
	spin_lock(root_lock);
137

138 139
	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
140
		ret = qdisc_qlen(q);
141
	} else if (ret == NETDEV_TX_LOCKED) {
142
		/* Driver try lock failed */
143
		ret = handle_dev_cpu_collision(skb, txq, q);
144
	} else {
145
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
146 147 148
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);
149

150
		ret = dev_requeue_skb(skb, q);
151
	}
152

153
	if (ret && netif_xmit_frozen_or_stopped(txq))
154 155
		ret = 0;

156
	return ret;
L
Linus Torvalds 已提交
157 158
}

159 160 161
/*
 * Dequeue one skb from @q and hand it to sch_direct_xmit().
 *
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC___STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;

	/* Dequeue packet */
	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;

	WARN_ON_ONCE(skb_dst_is_noref(skb));

	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock);
}

199
void __qdisc_run(struct Qdisc *q)
H
Herbert Xu 已提交
200
{
J
jamal 已提交
201
	int quota = weight_p;
202

203
	while (qdisc_restart(q)) {
204
		/*
J
jamal 已提交
205 206 207
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
208
		 */
209
		if (--quota <= 0 || need_resched()) {
210
			__netif_schedule(q);
211
			break;
212 213
		}
	}
H
Herbert Xu 已提交
214

215
	qdisc_run_end(q);
H
Herbert Xu 已提交
216 217
}

218 219
unsigned long dev_trans_start(struct net_device *dev)
{
220
	unsigned long val, res;
221 222
	unsigned int i;

223 224 225
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	res = dev->trans_start;
226 227 228 229 230 231
	for (i = 0; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
	dev->trans_start = res;
232

233 234 235 236
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

L
Linus Torvalds 已提交
237 238 239 240
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

H
Herbert Xu 已提交
241
	netif_tx_lock(dev);
242
	if (!qdisc_tx_is_noop(dev)) {
L
Linus Torvalds 已提交
243 244 245
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
246
			int some_queue_timedout = 0;
247
			unsigned int i;
248
			unsigned long trans_start;
249 250 251 252 253

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
254 255 256 257
				/*
				 * old device drivers set dev->trans_start
				 */
				trans_start = txq->trans_start ? : dev->trans_start;
258
				if (netif_xmit_stopped(txq) &&
259 260 261
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
262
					txq->trans_timeout++;
263 264 265
					break;
				}
			}
266

267 268
			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
269
				       dev->name, netdev_drivername(dev), i);
270
				dev->netdev_ops->ndo_tx_timeout(dev);
L
Linus Torvalds 已提交
271
			}
272 273 274
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
L
Linus Torvalds 已提交
275 276 277
				dev_hold(dev);
		}
	}
H
Herbert Xu 已提交
278
	netif_tx_unlock(dev);
L
Linus Torvalds 已提交
279 280 281 282 283 284

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
285
	if (dev->netdev_ops->ndo_tx_timeout) {
L
Linus Torvalds 已提交
286 287
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
288 289
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
L
Linus Torvalds 已提交
290 291 292 293 294 295 296 297 298 299 300
			dev_hold(dev);
	}
}

/* File-local wrapper that simply forwards to __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
H
Herbert Xu 已提交
301
	netif_tx_lock_bh(dev);
L
Linus Torvalds 已提交
302
	if (del_timer(&dev->watchdog_timer))
303
		dev_put(dev);
H
Herbert Xu 已提交
304
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
305 306
}

307 308 309 310 311 312
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
313 314
void netif_carrier_on(struct net_device *dev)
{
J
Jeff Garzik 已提交
315
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
316 317
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
318
		atomic_inc(&dev->carrier_changes);
319
		linkwatch_fire_event(dev);
J
Jeff Garzik 已提交
320 321 322
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
323
}
324
EXPORT_SYMBOL(netif_carrier_on);
325

326 327 328 329 330 331
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
332 333
void netif_carrier_off(struct net_device *dev)
{
334 335 336
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
337
		atomic_inc(&dev->carrier_changes);
338
		linkwatch_fire_event(dev);
339
	}
340
}
341
EXPORT_SYMBOL(netif_carrier_off);
342

L
Linus Torvalds 已提交
343 344 345 346 347
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

348
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
L
Linus Torvalds 已提交
349 350 351 352 353
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

354
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
355 356 357 358
{
	return NULL;
}

359
/* Operations of the "noop" qdisc: drop on enqueue, nothing to dequeue. */
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

368 369
static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
370
	.qdisc_sleeping	=	&noop_qdisc,
371 372
};

L
Linus Torvalds 已提交
373 374 375 376
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
377
	.ops		=	&noop_qdisc_ops,
L
Linus Torvalds 已提交
378
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
379
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
380
	.dev_queue	=	&noop_netdev_queue,
381
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
L
Linus Torvalds 已提交
382
};
383
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
384

385
/* Operations of the "noqueue" qdisc; same handlers as the noop qdisc. */
static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

394 395 396
static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
397
	.qdisc_sleeping	=	&noqueue_qdisc,
398 399
};

L
Linus Torvalds 已提交
400 401 402 403 404 405
static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
406 407
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
408
	.busylock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
L
Linus Torvalds 已提交
409 410 411
};


E
Eric Dumazet 已提交
412 413 414
/*
 * Maps a packet priority (skb->priority & TC_PRIO_MAX) to the pfifo_fast
 * band the packet is queued on. The same table is exported as the
 * priomap when the qdisc is dumped.
 */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};
415 416 417 418 419 420 421

/*
 * 3-band FIFO queue: old style, but should be a bit faster than a
 * generic prio+fifo combination.
 */

/* Number of bands (FIFO lists) a pfifo_fast qdisc manages. */
#define PFIFO_FAST_BANDS 3

422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441
/*
 * Private data for a pfifo_fast scheduler containing:
 *	- one skb queue per band (PFIFO_FAST_BANDS of them)
 *	- a bitmap with bit N set while band N contains skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct sk_buff_head q[PFIFO_FAST_BANDS];
};

/*
 * Lookup table from the band bitmap to the lowest-numbered band that
 * currently holds an skb (bit N of the index set <=> band N non-empty):
 *	index 0 - no band holds an skb, yields -1.
 *	index 1 - only band 0 holds skbs, yields 0.
 *	index 7 - all 3 bands hold skbs, yields 0, etc.
 */
static const int bitmap2band[] = {
	[0x0] = -1,
	[0x1] = 0,
	[0x2] = 1,
	[0x3] = 0,
	[0x4] = 2,
	[0x5] = 0,
	[0x6] = 1,
	[0x7] = 0,
};

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
442
{
443
	return priv->q + band;
444 445
}

E
Eric Dumazet 已提交
446
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
447
{
448 449 450 451
	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct sk_buff_head *list = band2list(priv, band);
L
Linus Torvalds 已提交
452

453
		priv->bitmap |= (1 << band);
454
		qdisc->q.qlen++;
455
		return __qdisc_enqueue_tail(skb, qdisc, list);
456
	}
457 458

	return qdisc_drop(skb, qdisc);
L
Linus Torvalds 已提交
459 460
}

E
Eric Dumazet 已提交
461
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
462
{
463 464
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];
L
Linus Torvalds 已提交
465

466 467 468 469 470 471 472 473 474
	if (likely(band >= 0)) {
		struct sk_buff_head *list = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);

		qdisc->q.qlen--;
		if (skb_queue_empty(list))
			priv->bitmap &= ~(1 << band);

		return skb;
475
	}
476

L
Linus Torvalds 已提交
477 478 479
	return NULL;
}

E
Eric Dumazet 已提交
480
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
481
{
482 483 484 485 486
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct sk_buff_head *list = band2list(priv, band);
487

488
		return skb_peek(list);
489 490 491 492 493
	}

	return NULL;
}

E
Eric Dumazet 已提交
494
static void pfifo_fast_reset(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
495
{
496
	int prio;
497
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
498 499

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
500
		__qdisc_reset_queue(qdisc, band2list(priv, prio));
501

502
	priv->bitmap = 0;
503
	qdisc->qstats.backlog = 0;
504
	qdisc->q.qlen = 0;
L
Linus Torvalds 已提交
505 506
}

507 508 509 510
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

E
Eric Dumazet 已提交
511
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
512 513
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
514 515 516 517 518 519 520 521 522
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
523
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
524 525

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
526
		skb_queue_head_init(band2list(priv, prio));
527

528 529
	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
530 531 532
	return 0;
}

533
/* Operations table of the "pfifo_fast" qdisc. */
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

545 546
/*
 * Fallback lockdep class for a qdisc's busylock, used when the device
 * does not provide its own qdisc_tx_busylock key.
 */
static struct lock_class_key qdisc_tx_busylock;

547
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
548
			  const struct Qdisc_ops *ops)
L
Linus Torvalds 已提交
549 550 551
{
	void *p;
	struct Qdisc *sch;
E
Eric Dumazet 已提交
552
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
553
	int err = -ENOBUFS;
554
	struct net_device *dev = dev_queue->dev;
L
Linus Torvalds 已提交
555

556 557 558
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

L
Linus Torvalds 已提交
559
	if (!p)
560 561
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
E
Eric Dumazet 已提交
562 563 564 565 566 567 568 569 570 571
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
L
Linus Torvalds 已提交
572 573
	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
574

575
	spin_lock_init(&sch->busylock);
576 577 578
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

L
Linus Torvalds 已提交
579 580 581
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
582
	sch->dev_queue = dev_queue;
583
	dev_hold(dev);
L
Linus Torvalds 已提交
584
	atomic_set(&sch->refcnt, 1);
585 586 587

	return sch;
errout:
588
	return ERR_PTR(err);
589 590
}

591
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
592 593
				const struct Qdisc_ops *ops,
				unsigned int parentid)
594 595
{
	struct Qdisc *sch;
596

597 598 599
	if (!try_module_get(ops->owner))
		goto errout;

600
	sch = qdisc_alloc(dev_queue, ops);
601 602
	if (IS_ERR(sch))
		goto errout;
603
	sch->parent = parentid;
604

L
Linus Torvalds 已提交
605 606 607
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

608
	qdisc_destroy(sch);
609
errout:
L
Linus Torvalds 已提交
610 611
	return NULL;
}
612
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
613

614
/* Under qdisc_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
615 616 617

void qdisc_reset(struct Qdisc *qdisc)
{
618
	const struct Qdisc_ops *ops = qdisc->ops;
L
Linus Torvalds 已提交
619 620 621

	if (ops->reset)
		ops->reset(qdisc);
622

623
	if (qdisc->gso_skb) {
624
		kfree_skb_list(qdisc->gso_skb);
625 626 627
		qdisc->gso_skb = NULL;
		qdisc->q.qlen = 0;
	}
L
Linus Torvalds 已提交
628
}
629
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
630

E
Eric Dumazet 已提交
631 632 633 634 635 636 637
/*
 * RCU callback that releases a qdisc's memory. The pointer passed to
 * kfree() is the qdisc address moved back by the recorded padding.
 */
static void qdisc_rcu_free(struct rcu_head *head)
{
	struct Qdisc *sch = container_of(head, struct Qdisc, rcu_head);
	char *alloc_start = (char *) sch - sch->padded;

	kfree(alloc_start);
}

638
void qdisc_destroy(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
639
{
640 641
	const struct Qdisc_ops  *ops = qdisc->ops;

642 643 644 645
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

646
#ifdef CONFIG_NET_SCHED
647 648
	qdisc_list_del(qdisc);

E
Eric Dumazet 已提交
649
	qdisc_put_stab(rtnl_dereference(qdisc->stab));
650
#endif
651 652 653 654 655 656 657 658 659
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

660
	kfree_skb_list(qdisc->gso_skb);
E
Eric Dumazet 已提交
661 662 663 664 665
	/*
	 * gen_estimator est_timer() might access qdisc->q.lock,
	 * wait a RCU grace period before freeing qdisc.
	 */
	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
L
Linus Torvalds 已提交
666
}
667
EXPORT_SYMBOL(qdisc_destroy);
L
Linus Torvalds 已提交
668

669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692
/*
 * Attach toplevel qdisc to device queue.
 *
 * Under the old sleeping qdisc's lock (BH disabled): the old qdisc is
 * reset if its refcnt is at most 1, @qdisc (or &noop_qdisc when @qdisc is
 * NULL) becomes dev_queue->qdisc_sleeping, and the active
 * dev_queue->qdisc is pointed at &noop_qdisc.
 *
 * Returns the previous sleeping qdisc.
 */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	/* NOTE(review): oqdisc was already passed to qdisc_lock() above, so
	 * the NULL test below looks redundant - confirm that
	 * qdisc_sleeping can never be NULL here.
	 */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);
694

695 696 697 698
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
E
Eric Dumazet 已提交
699
	struct Qdisc *qdisc = &noqueue_qdisc;
700 701

	if (dev->tx_queue_len) {
702
		qdisc = qdisc_create_dflt(dev_queue,
703
					  default_qdisc_ops, TC_H_ROOT);
704
		if (!qdisc) {
E
Eric Dumazet 已提交
705
			netdev_info(dev, "activation failed\n");
706 707
			return;
		}
708 709
		if (!netif_is_multiqueue(dev))
			qdisc->flags |= TCQ_F_ONETXQUEUE;
710 711 712 713
	}
	dev_queue->qdisc_sleeping = qdisc;
}

714 715 716 717 718 719 720 721 722 723 724 725
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {
726
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
727 728
		if (qdisc) {
			dev->qdisc = qdisc;
729
			qdisc->ops->attach(qdisc);
730 731 732 733
		}
	}
}

734 735 736 737
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
738
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
739 740
	int *need_watchdog_p = _need_watchdog;

741 742 743
	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

744
	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
745 746
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
		dev_queue->trans_start = 0;
747
		*need_watchdog_p = 1;
748
	}
749 750
}

L
Linus Torvalds 已提交
751 752
void dev_activate(struct net_device *dev)
{
753
	int need_watchdog;
754

L
Linus Torvalds 已提交
755
	/* No queueing discipline is attached to device;
756 757
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
L
Linus Torvalds 已提交
758 759
	 */

760 761
	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);
762

763 764 765 766
	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

767 768
	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
769 770
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
771 772

	if (need_watchdog) {
L
Linus Torvalds 已提交
773 774 775
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
776
}
777
EXPORT_SYMBOL(dev_activate);
778

779 780 781
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
782
{
783
	struct Qdisc *qdisc_default = _qdisc_default;
784 785
	struct Qdisc *qdisc;

786
	qdisc = rtnl_dereference(dev_queue->qdisc);
787
	if (qdisc) {
788 789
		spin_lock_bh(qdisc_lock(qdisc));

790 791 792
		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

793
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
794
		qdisc_reset(qdisc);
795

796
		spin_unlock_bh(qdisc_lock(qdisc));
797
	}
L
Linus Torvalds 已提交
798 799
}

800
static bool some_qdisc_is_busy(struct net_device *dev)
801 802 803 804 805
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
806
		spinlock_t *root_lock;
807
		struct Qdisc *q;
808 809 810
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
811
		q = dev_queue->qdisc_sleeping;
812
		root_lock = qdisc_lock(q);
813

814
		spin_lock_bh(root_lock);
815

816
		val = (qdisc_is_running(q) ||
817
		       test_bit(__QDISC_STATE_SCHED, &q->state));
818

819
		spin_unlock_bh(root_lock);
820 821 822 823 824 825 826

		if (val)
			return true;
	}
	return false;
}

827 828 829 830 831 832 833
/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
834
void dev_deactivate_many(struct list_head *head)
L
Linus Torvalds 已提交
835
{
836
	struct net_device *dev;
837
	bool sync_needed = false;
838

839
	list_for_each_entry(dev, head, close_list) {
840 841 842 843 844 845 846
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
847
		sync_needed |= !dev->dismantle;
848
	}
L
Linus Torvalds 已提交
849

850 851 852 853 854 855
	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();
L
Linus Torvalds 已提交
856

857
	/* Wait for outstanding qdisc_run calls. */
858
	list_for_each_entry(dev, head, close_list)
859 860 861 862 863 864 865 866
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

867
	list_add(&dev->close_list, &single);
868
	dev_deactivate_many(&single);
869
	list_del(&single);
L
Linus Torvalds 已提交
870
}
871
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
872

873 874
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
875
				     void *_qdisc)
876
{
877 878
	struct Qdisc *qdisc = _qdisc;

879
	rcu_assign_pointer(dev_queue->qdisc, qdisc);
880 881 882
	dev_queue->qdisc_sleeping = qdisc;
}

L
Linus Torvalds 已提交
883 884
void dev_init_scheduler(struct net_device *dev)
{
885
	dev->qdisc = &noop_qdisc;
886
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
887 888
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
L
Linus Torvalds 已提交
889

890
	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
L
Linus Torvalds 已提交
891 892
}

893 894 895
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
L
Linus Torvalds 已提交
896
{
897
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
898
	struct Qdisc *qdisc_default = _qdisc_default;
899 900

	if (qdisc) {
901
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
902
		dev_queue->qdisc_sleeping = qdisc_default;
L
Linus Torvalds 已提交
903 904

		qdisc_destroy(qdisc);
905
	}
906 907 908 909
}

void dev_shutdown(struct net_device *dev)
{
910
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
911 912
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
913 914 915
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

916
	WARN_ON(timer_pending(&dev->watchdog_timer));
L
Linus Torvalds 已提交
917
}
918

919
/*
 * Fill @r from @conf and @rate64 and precompute the mult/shift pair used
 * to convert a packet length into a transmission time.
 *
 * @r is zeroed first. The effective rate is the larger of conf->rate and
 * @rate64. With a zero rate, mult stays 1 and shift stays 0.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			/* stop once mult uses bit 31 or factor cannot grow */
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);