sch_generic.c 25.7 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
27
#include <linux/slab.h>
28
#include <linux/if_vlan.h>
29
#include <net/sch_generic.h>
L
Linus Torvalds 已提交
30
#include <net/pkt_sched.h>
E
Eric Dumazet 已提交
31
#include <net/dst.h>
32
#include <trace/events/qdisc.h>
L
Linus Torvalds 已提交
33

34 35 36 37
/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

L
Linus Torvalds 已提交
38 39
/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

49
/* Put an skb back at the head of the qdisc after a failed transmit.
 * The skb stays accounted in qlen/backlog, and the qdisc is rescheduled
 * so the transmit is retried later.
 */
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	q->gso_skb = skb;
	q->qstats.requeues++;
	qdisc_qstats_backlog_inc(q, skb);
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

60 61
/* Chain additional packets onto skb->next, up to the byte budget allowed
 * by @txq (BQL), so a single driver call can transmit several packets.
 * Caller guarantees (via qdisc_may_bulk()) that all skbs share one txq.
 */
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb->next = NULL;
}

81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			/* Different txq: stash for later, keep it accounted
			 * in qlen/backlog until dequeue_skb() retries it.
			 */
			q->skb_bad_txq = nskb;
			qdisc_qstats_backlog_inc(q, nskb);
			q->q.qlen++;
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);	/* bound the bulk to 8 packets */
	(*packets) += cnt;
	skb->next = NULL;
}

109 110 111
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 *
 * Priority order: a previously requeued skb (gso_skb), then a stashed
 * wrong-txq skb (skb_bad_txq), then a fresh dequeue with optional bulking.
 * *validate tells the caller whether the skb(s) still need xmit validation.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	struct sk_buff *skb = q->gso_skb;
	const struct netdev_queue *txq = q->dev_queue;

	*packets = 1;
	if (unlikely(skb)) {
		/* skb in gso_skb were already validated */
		*validate = false;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->gso_skb = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
		} else
			skb = NULL;
		goto trace;
	}
	*validate = true;
	skb = q->skb_bad_txq;
	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->skb_bad_txq = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
			goto bulk;
		}
		skb = NULL;
		goto trace;
	}
	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
	    !netif_xmit_frozen_or_stopped(txq))
		skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

161
/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning running seqcount bit guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 */
int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		    struct net_device *dev, struct netdev_queue *txq,
		    spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc (root_lock is NULL for TCQ_F_NOLOCK qdiscs) */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev);

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		/* validation consumed/dropped the whole list */
		if (root_lock)
			spin_lock(root_lock);
		return qdisc_qlen(q);
	}

	if (root_lock)
		spin_lock(root_lock);

	if (dev_xmit_complete(ret)) {
		/* Driver sent out skb successfully or skb was consumed */
		ret = qdisc_qlen(q);
	} else {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		ret = dev_requeue_skb(skb, q);
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		ret = 0;

	return ret;
}

217 218 219
/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline int qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return 0;

	/* Lockless (TCQ_F_NOLOCK) qdiscs skip the root lock */
	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

258
/* Drain the qdisc: dequeue and transmit until empty, the dev_tx_weight
 * quota is consumed, or another task needs the CPU; in the latter cases
 * remaining work is deferred via __netif_schedule().
 */
void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
		 */
		quota -= packets;
		if (quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}
}

277 278
unsigned long dev_trans_start(struct net_device *dev)
{
279
	unsigned long val, res;
280 281
	unsigned int i;

282 283
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
F
Florian Westphal 已提交
284 285
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
286 287 288 289
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
290

291 292 293 294
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

295
/* TX watchdog timer callback: if any stopped TX queue has not made
 * progress within dev->watchdog_timeo, report a timeout to the driver
 * via ndo_tx_timeout(), then re-arm the timer while the device is up.
 */
static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;	/* i identifies the queue below */
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			/* mod_timer() returning 0 means the timer was inactive:
			 * take a fresh reference for the newly pending timer.
			 */
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

/* (Re)arm the TX watchdog timer if the driver implements ndo_tx_timeout;
 * a reference on @dev is held while the timer is pending.
 */
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;	/* default timeout */
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

/* Thin wrapper used by dev_activate(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

/* Stop the TX watchdog; drop the device reference the pending timer held. */
static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

362 363 364 365 366 367
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);
380

381 382 383 384 385 386
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);
397

L
Linus Torvalds 已提交
398 399 400 401 402
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

403 404
/* noop qdisc: every packet is dropped; NET_XMIT_CN tells the caller
 * the packet was discarded due to (permanent) congestion.
 */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

410
/* noop qdisc never holds packets, so there is never anything to dequeue. */
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

415
/* Ops for the built-in "noop" qdisc: drops everything (see noop_enqueue). */
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

424 425
/* Dummy netdev_queue backing the static noop_qdisc below. */
static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

L
Linus Torvalds 已提交
429 430 431 432
/* The shared, statically allocated drop-everything qdisc.  TCQ_F_BUILTIN
 * makes qdisc_destroy() a no-op for it.
 */
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
440

P
Phil Sutter 已提交
441 442 443 444 445 446 447 448 449 450
static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

/* "noqueue": packets bypass queueing entirely (enqueue cleared by
 * noqueue_init); used for virtual devices with IFF_NO_QUEUE.
 */
struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

E
Eric Dumazet 已提交
460 461 462
/* Map a TC_PRIO_* value (0..TC_PRIO_MAX) to one of the three bands;
 * band 0 has the highest priority.
 */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

470 471 472 473 474 475 476
/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three band
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

488
static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
489
					     int band)
490
{
491
	return priv->q + band;
492 493
}

494 495
/* Enqueue on the band selected by skb->priority, or drop when the device's
 * tx_queue_len limit is reached.
 */
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct qdisc_skb_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);	/* mark band non-empty */
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc, to_free);
}

E
Eric Dumazet 已提交
510
/* Dequeue from the highest-priority non-empty band (via bitmap2band),
 * clearing the band's bitmap bit when it becomes empty.
 */
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct qdisc_skb_head *qh = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qh);

		if (likely(skb != NULL)) {
			qdisc_qstats_backlog_dec(qdisc, skb);
			qdisc_bstats_update(qdisc, skb);
		}

		qdisc->q.qlen--;
		if (qh->qlen == 0)
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

E
Eric Dumazet 已提交
534
/* Peek at the head of the highest-priority non-empty band without
 * removing it.
 */
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct qdisc_skb_head *qh = band2list(priv, band);

		return qh->head;
	}

	return NULL;
}

E
Eric Dumazet 已提交
548
/* Purge all three bands and zero the bitmap and queue statistics. */
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

561 562 563 564
/* Dump band count and priority map as a TCA_OPTIONS netlink attribute. */
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

/* Initialise the three band queues; @opt is ignored (pfifo_fast takes
 * no parameters).
 */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		qdisc_skb_head_init(band2list(priv, prio));

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

587
/* The default qdisc (see default_qdisc_ops at the top of this file). */
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_fast_ops);
L
Linus Torvalds 已提交
599

600
/* Fallback lockdep classes for devices that don't provide their own. */
static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

/* Allocate and minimally initialise a Qdisc (+ ops->priv_size private
 * area), NUMA-local to the tx queue.  Returns an ERR_PTR on failure.
 * The caller still has to run ops->init and attach the qdisc.
 */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);	/* released in qdisc_destroy() */
	refcount_set(&sch->refcnt, 1);

	return sch;
errout:
	return ERR_PTR(err);
}

657
/* Allocate and initialise a qdisc of type @ops under @parentid, taking
 * a module reference on @ops->owner.  Returns NULL on any failure.
 */
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner))
		return NULL;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);	/* also drops the module reference */
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
680

681
/* Under qdisc_lock(qdisc) and BH! */

/* Flush all queued packets (via ops->reset plus the generic gso_skb and
 * skb_bad_txq stashes) and zero the queue counters.
 */
void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->reset)
		ops->reset(qdisc);

	kfree_skb(qdisc->skb_bad_txq);
	qdisc->skb_bad_txq = NULL;

	if (qdisc->gso_skb) {
		kfree_skb_list(qdisc->gso_skb);	/* may be a chained list */
		qdisc->gso_skb = NULL;
	}
	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
701

702
/* Release per-cpu stats (if any) and the qdisc memory itself, undoing
 * the alignment padding added in qdisc_alloc().
 */
static void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

712
/* Drop a reference; on the last one, tear the qdisc down completely:
 * unhash, kill the rate estimator, run ops->reset/destroy, release the
 * module and device references, free stashed skbs and the memory.
 * Built-in qdiscs (TCQ_F_BUILTIN, e.g. noop_qdisc) are never destroyed.
 */
void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb_list(qdisc->gso_skb);
	kfree_skb(qdisc->skb_bad_txq);
	qdisc_free(qdisc);
}
EXPORT_SYMBOL(qdisc_destroy);
L
Linus Torvalds 已提交
739

740 741 742 743 744 745 746 747 748 749 750
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one; the active qdisc stays noop until
	 * dev_activate() swaps qdisc_sleeping in.
	 */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;	/* caller owns the old qdisc */
}
EXPORT_SYMBOL(dev_graft_qdisc);
765

766 767 768 769
/* Create the default qdisc (or noqueue for IFF_NO_QUEUE devices) for one
 * tx queue and park it in qdisc_sleeping.
 */
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
	}
	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

786 787 788 789 790 791 792
/* Give a device its default qdiscs: per-queue defaults for single-queue
 * or IFF_NO_QUEUE devices, otherwise an mq root that fans out per queue.
 */
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

811 812 813 814
/* Make the sleeping qdisc the active one for a queue and clear its
 * DEACTIVATED bit; flag the caller to start the watchdog if requested.
 */
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

L
Linus Torvalds 已提交
828 829
/* Bring the device's qdiscs online: attach defaults if none configured,
 * activate the sleeping qdiscs on every tx queue (and ingress), and start
 * the TX watchdog.  Deferred until carrier-on when the link is down.
 */
void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);
855

856 857 858
/* Take one queue's qdisc offline: mark it DEACTIVATED, swap the default
 * (noop) qdisc in for new transmitters, and flush queued packets.
 */
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

877
/* Return true if any of the device's qdiscs is still running or has a
 * softirq run pending; used to wait for quiescence during deactivation.
 */
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		if (q->flags & TCQ_F_NOLOCK) {
			val = test_bit(__QDISC_STATE_SCHED, &q->state);
		} else {
			root_lock = qdisc_lock(q);
			spin_lock_bh(root_lock);

			val = (qdisc_is_running(q) ||
			       test_bit(__QDISC_STATE_SCHED, &q->state));

			spin_unlock_bh(root_lock);
		}

		if (val)
			return true;
	}
	return false;
}

908 909 910 911 912 913 914
/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;
	bool sync_needed = false;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
		sync_needed |= !dev->dismantle;
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			yield();
}

/* Single-device convenience wrapper around dev_deactivate_many(). */
void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
953

954 955
/* Install @_qdisc as both the active and sleeping qdisc of one queue. */
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

L
Linus Torvalds 已提交
964 965
/* Initialise a new device's scheduling state: noop qdisc everywhere and
 * a (not yet armed) TX watchdog timer.
 */
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

974 975 976
/* Replace one queue's sleeping qdisc with @_qdisc_default and destroy
 * the old one.
 */
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

/* Tear down all of a device's qdiscs on unregister; the watchdog must
 * already have been stopped (see WARN_ON below).
 */
void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}
999

1000
/* Precompute mult/shift for rate-to-time conversion from @conf (with
 * @rate64 overriding the 32-bit conf->rate when larger).
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080

/* Empty RCU callback: used only so rcu_barrier_bh() can detect that the
 * grace period for a retired mini_Qdisc has elapsed.
 */
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

/* Publish a new filter-chain head via the inactive half of the mini
 * qdisc pair; RCU barriers order the transition for lockless readers.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is counterpart of the rcu barrier above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

/* Initialise both halves of a mini qdisc pair to share the owning
 * qdisc's per-cpu stats, and record where the active pointer lives.
 */
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);