/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

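/* Stash a packet that could not be sent back on the qdisc (as gso_skb),
 * keep it accounted in q->q.qlen, and reschedule the qdisc so the
 * transmission is retried later.
 */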
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	skb_dst_force(skb);
	q->gso_skb = skb;
	q->qstats.requeues++;
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

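/* Pick the next packet to transmit: a previously requeued gso_skb takes
 * priority, but is only handed out once its tx queue is no longer frozen
 * or stopped; otherwise fall back to the qdisc's own dequeue.
 */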
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *skb = q->gso_skb;
	const struct netdev_queue *txq = q->dev_queue;

	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = netdev_get_tx_queue(txq->dev, skb_get_queue_mapping(skb));
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->gso_skb = NULL;
			q->q.qlen--;
		} else
			skb = NULL;
	} else {
		if (!(q->flags & TCQ_F_ONETXQUEUE) || !netif_xmit_frozen_or_stopped(txq))
			skb = q->dequeue(q);
	}

	return skb;
}

static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU holding the lock. It may be a transient
		 * configuration error, when hard_start_xmit() recurses. We
		 * detect it by checking xmit owner and drop the packet when
		 * deadloop is detected. Return OK to try the next skb.
		 */
		kfree_skb(skb);
		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
				     dev_queue->dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another cpu is holding lock, requeue & delay xmits for
		 * some time.
		 */
		__this_cpu_inc(softnet_data.cpu_collision);
		ret = dev_requeue_skb(skb, q);
	}

	return ret;
}

/*
 * Transmit one skb, and handle the return status as required. Holding the
 * __QDISC___STATE_RUNNING bit guarantees that only one CPU can execute this
 * function.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	spin_unlock(root_lock);

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_stopped(txq))
		ret = dev_hard_start_xmit(skb, dev, txq);

	HARD_TX_UNLOCK(dev, txq);

	spin_lock(root_lock);

	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
		ret = qdisc_qlen(q);
	} else if (ret == NETDEV_TX_LOCKED) {
		/* Driver try lock failed */
		ret = handle_dev_cpu_collision(skb, txq, q);
	} else {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		ret = 0;

	return ret;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC___STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;

	/* Dequeue packet */
	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;
	WARN_ON_ONCE(skb_dst_is_noref(skb));
	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = netdev_get_tx_queue(dev, skb_get_queue_mapping(skb));

	return sch_direct_xmit(skb, q, dev, txq, root_lock);
}

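/* Drain the qdisc until it is empty, the weight_p packet quota is spent,
 * or another task needs the CPU; in the latter two cases processing is
 * deferred via __netif_schedule().
 */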
void __qdisc_run(struct Qdisc *q)
{
	int quota = weight_p;

	while (qdisc_restart(q)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU.
		 */
		if (--quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}

	qdisc_run_end(q);
}

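/* Return the most recent trans_start of any tx queue of the device
 * (resolving VLAN devices to their real device) and cache the result in
 * dev->trans_start.
 */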
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	res = dev->trans_start;
	for (i = 0; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
	dev->trans_start = res;

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

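/* Per-device transmit watchdog timer: if a stopped tx queue has seen no
 * transmission for longer than watchdog_timeo, report it through the
 * driver's ndo_tx_timeout() and re-arm the timer.
 */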
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				/*
				 * old device drivers set dev->trans_start
				 */
				trans_start = txq->trans_start ? : dev->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier is present.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
	.qdisc_sleeping	=	&noqueue_qdisc,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
};


static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three bands
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct sk_buff_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return priv->q + band;
}

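/* Enqueue at the tail of the band selected by skb->priority, or drop the
 * packet once the device's tx_queue_len limit has been reached.
 */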
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct sk_buff_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

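/* Dequeue from the highest-priority (lowest-numbered) non-empty band and
 * clear its bit in the bitmap once that band runs empty.
 */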
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct sk_buff_head *list = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);

		qdisc->q.qlen--;
		if (skb_queue_empty(list))
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct sk_buff_head *list = band2list(priv, band);

		return skb_peek(list);
	}

	return NULL;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(band2list(priv, prio));

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

static struct lock_class_key qdisc_tx_busylock;

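/* Allocate a qdisc with ops->priv_size bytes of private data on the tx
 * queue's NUMA node; the returned struct Qdisc is QDISC_ALIGNTO aligned.
 */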
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev = dev_queue->dev;

	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

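/* Allocate a qdisc of the given ops, set its parent and run ops->init();
 * returns NULL on failure.
 */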
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner))
		goto errout;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	if (qdisc->gso_skb) {
		kfree_skb(qdisc->gso_skb);
		qdisc->gso_skb = NULL;
		qdisc->q.qlen = 0;
	}
}
EXPORT_SYMBOL(qdisc_reset);

static void qdisc_rcu_free(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);

	kfree((char *) qdisc - qdisc->padded);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_list_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb(qdisc->gso_skb);
	/*
	 * gen_estimator est_timer() might access qdisc->q.lock,
	 * wait a RCU grace period before freeing qdisc.
	 */
	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc = &noqueue_qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev_queue,
					  default_qdisc_ops, TC_H_ROOT);
		if (!qdisc) {
			netdev_info(dev, "activation failed\n");
			return;
		}
		if (!netif_is_multiqueue(dev))
			qdisc->flags |= TCQ_F_ONETXQUEUE;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

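/* Choose the default qdisc layout at activation time: per-queue default
 * qdiscs (or noqueue) for single-queue devices, or an mq qdisc that
 * manages one child per tx queue on multiqueue devices.
 */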
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
}

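/* Promote the sleeping qdisc to be the active qdisc of a tx queue and
 * note whether the transmit watchdog needs to be started.
 */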
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create a default one for devices that need queueing,
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

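/* Swap the active qdisc of a queue for qdisc_default (normally
 * noop_qdisc), marking the old one deactivated and resetting it under
 * its lock.
 */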
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = dev_queue->qdisc;
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

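/* Return true if any tx qdisc of the device is still running or is
 * scheduled for transmission.
 */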
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;
		root_lock = qdisc_lock(q);

		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;
	bool sync_needed = false;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
		sync_needed |= !dev->dismantle;
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	dev_queue->qdisc = qdisc;
	dev_queue->qdisc_sleeping = qdisc;
}

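/* At registration time every queue gets the noop qdisc and the watchdog
 * timer is set up; real qdiscs are attached later by dev_activate().
 */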
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

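/* Precompute the reciprocal (mult/shift) form of a rate so that the fast
 * path can convert a packet length to a transmit time in nanoseconds
 * without a 64-bit divide.
 */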
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
935
	 */
936 937 938 939 940 941
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);