/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	skb_dst_force(skb);
	q->gso_skb = skb;
	q->qstats.requeues++;
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

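/* Pull further packets from the qdisc while the caller's byte budget
 * lasts, validate each one and chain it onto head_skb via skb->next,
 * so the driver can be handed a single ready-to-send list.
 */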
static struct sk_buff *try_bulk_dequeue_skb(struct Qdisc *q,
					    struct sk_buff *head_skb,
					    int bytelimit)
{
	struct sk_buff *skb, *tail_skb = head_skb;

	while (bytelimit > 0) {
		skb = q->dequeue(q);
		if (!skb)
			break;

		bytelimit -= skb->len; /* covers GSO len */
		skb = validate_xmit_skb(skb, qdisc_dev(q));
		if (!skb)
			break;

		while (tail_skb->next) /* GSO list goto tail */
			tail_skb = tail_skb->next;

		tail_skb->next = skb;
		tail_skb = skb;
	}

	return head_skb;
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
static inline struct sk_buff *dequeue_skb(struct Qdisc *q)
{
	struct sk_buff *skb = q->gso_skb;
	const struct netdev_queue *txq = q->dev_queue;

	if (unlikely(skb)) {
		/* check the reason for requeuing without taking the tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->gso_skb = NULL;
			q->q.qlen--;
		} else
			skb = NULL;
	} else {
		if (!(q->flags & TCQ_F_ONETXQUEUE) ||
		    !netif_xmit_frozen_or_stopped(txq)) {
			int bytelimit = qdisc_avail_bulklimit(txq);

			skb = q->dequeue(q);
			if (skb) {
				bytelimit -= skb->len;
				skb = validate_xmit_skb(skb, qdisc_dev(q));
			}
			if (skb && qdisc_may_bulk(q))
				skb = try_bulk_dequeue_skb(q, skb, bytelimit);
		}
	}

	return skb;
}

static inline int handle_dev_cpu_collision(struct sk_buff *skb,
					   struct netdev_queue *dev_queue,
					   struct Qdisc *q)
{
	int ret;

	if (unlikely(dev_queue->xmit_lock_owner == smp_processor_id())) {
		/*
		 * Same CPU is holding the lock. It may be a transient
		 * configuration error when hard_start_xmit() recurses; we
		 * detect it by checking the xmit owner and drop the packet
		 * when a dead loop is detected. Return OK to try the next skb.
		 */
		kfree_skb_list(skb);
		net_warn_ratelimited("Dead loop on netdevice %s, fix it urgently!\n",
				     dev_queue->dev->name);
		ret = qdisc_qlen(q);
	} else {
		/*
		 * Another CPU is holding the lock; requeue the skb and
		 * delay xmits for some time.
		 */
		__this_cpu_inc(softnet_data.cpu_collision);
		ret = dev_requeue_skb(skb, q);
	}

	return ret;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Holding the __QDISC___STATE_RUNNING bit guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	spin_unlock(root_lock);

	HARD_TX_LOCK(dev, txq, smp_processor_id());
	if (!netif_xmit_frozen_or_stopped(txq))
		skb = dev_hard_start_xmit(skb, dev, txq, &ret);

	HARD_TX_UNLOCK(dev, txq);

	spin_lock(root_lock);

	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
		ret = qdisc_qlen(q);
	} else if (ret == NETDEV_TX_LOCKED) {
		/* Driver try lock failed */
		ret = handle_dev_cpu_collision(skb, txq, q);
	} else {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		ret = 0;

	return ret;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * __QDISC___STATE_RUNNING guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to the device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive:
 *  if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct Qdisc *q)
{
	struct netdev_queue *txq;
	struct net_device *dev;
	spinlock_t *root_lock;
	struct sk_buff *skb;

	/* Dequeue packet */
	skb = dequeue_skb(q);
	if (unlikely(!skb))
		return 0;

	WARN_ON_ONCE(skb_dst_is_noref(skb));

	root_lock = qdisc_lock(q);
	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock);
}

void __qdisc_run(struct Qdisc *q)
{
	int quota = weight_p;

	while (qdisc_restart(q)) {
		/*
		 * Ordered by likelihood of occurrence: postpone processing if
		 * 1. we've exceeded the packet quota, or
		 * 2. another process needs the CPU.
		 */
		if (--quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}

	qdisc_run_end(q);
}

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	res = dev->trans_start;
	for (i = 0; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
	dev->trans_start = res;

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				/*
				 * old device drivers set dev->trans_start
				 */
				trans_start = txq->trans_start ? : dev->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);

static struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct Qdisc noqueue_qdisc;
static struct netdev_queue noqueue_netdev_queue = {
	.qdisc		=	&noqueue_qdisc,
	.qdisc_sleeping	=	&noqueue_qdisc,
};

static struct Qdisc noqueue_qdisc = {
	.enqueue	=	NULL,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noqueue_qdisc_ops,
	.list		=	LIST_HEAD_INIT(noqueue_qdisc.list),
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock),
	.dev_queue	=	&noqueue_netdev_queue,
	.busylock	=	__SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock),
};


static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};
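/* With the TC_PRIO_* values from <linux/pkt_sched.h>, this means e.g.
 * TC_PRIO_INTERACTIVE (6) maps to band 0 (dequeued first),
 * TC_PRIO_BESTEFFORT (0) to band 1, and TC_PRIO_BULK (2) to band 2.
 */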

/* 3-band FIFO queue: old style, but should be a bit faster than
   the generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three bands
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct sk_buff_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};
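/* Example: bitmap = 6 (binary 110) means bands 1 and 2 hold packets,
 * and bitmap2band[6] = 1, so band 1 is dequeued first (band 0 has the
 * highest priority, band 2 the lowest).
 */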

static inline struct sk_buff_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return priv->q + band;
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc)
{
	if (skb_queue_len(&qdisc->q) < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct sk_buff_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc);
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct sk_buff_head *list = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qdisc, list);

		qdisc->q.qlen--;
		if (skb_queue_empty(list))
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct sk_buff_head *list = band2list(priv, band);

		return skb_peek(list);
	}

	return NULL;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(qdisc, band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__skb_queue_head_init(band2list(priv, prio));

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

static struct lock_class_key qdisc_tx_busylock;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev = dev_queue->dev;

	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non-aligned memory, ask for more and do the alignment ourselves */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	atomic_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner))
		goto errout;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch))
		goto errout;
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
errout:
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	if (qdisc->gso_skb) {
		kfree_skb_list(qdisc->gso_skb);
		qdisc->gso_skb = NULL;
		qdisc->q.qlen = 0;
	}
}
EXPORT_SYMBOL(qdisc_reset);

static void qdisc_rcu_free(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);

	if (qdisc_is_percpu_stats(qdisc))
		free_percpu(qdisc->cpu_bstats);

	kfree((char *) qdisc - qdisc->padded);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !atomic_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_list_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb_list(qdisc->gso_skb);
	/*
	 * gen_estimator est_timer() might access qdisc->q.lock,
	 * so wait an RCU grace period before freeing qdisc.
	 */
	call_rcu(&qdisc->rcu_head, qdisc_rcu_free);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && atomic_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc = &noqueue_qdisc;

	if (dev->tx_queue_len) {
		qdisc = qdisc_create_dflt(dev_queue,
					  default_qdisc_ops, TC_H_ROOT);
		if (!qdisc) {
			netdev_info(dev, "activation failed\n");
			return;
		}
		if (!netif_is_multiqueue(dev))
			qdisc->flags |= TCQ_F_ONETXQUEUE;
	}
	dev_queue->qdisc_sleeping = qdisc;
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) || dev->tx_queue_len == 0) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		atomic_inc(&dev->qdisc->refcnt);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p && new_qdisc != &noqueue_qdisc) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to the device;
	 * create a default one for devices which need queueing,
	 * and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

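/* Returns true while any tx queue's root qdisc is still running or is
 * scheduled for execution, i.e. while packets may still be in flight.
 */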
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;
		root_lock = qdisc_lock(q);

		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;
	bool sync_needed = false;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
		sync_needed |= !dev->dismantle;
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase:
	 * the caller will call synchronize_net() for us.
	 */
	if (sync_needed)
		synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in the fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
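	/* For example, with rate_bytes_ps = 125000 (a 1 Mbit/s link) the
	 * loop below settles on a mult/shift pair for which
	 * (1500 * mult) >> shift is roughly 12,000,000 ns, the ~12 ms wire
	 * time of a 1500 byte frame.
	 */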
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);