sch_generic.c 32.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
27
#include <linux/slab.h>
28
#include <linux/if_vlan.h>
29
#include <linux/skb_array.h>
30
#include <linux/if_macvlan.h>
31
#include <net/sch_generic.h>
L
Linus Torvalds 已提交
32
#include <net/pkt_sched.h>
E
Eric Dumazet 已提交
33
#include <net/dst.h>
34
#include <trace/events/qdisc.h>
35
#include <net/xfrm.h>
L
Linus Torvalds 已提交
36

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

/* Pull the head skb off q->skb_bad_txq, but only if the txq that
 * previously rejected it is no longer frozen/stopped.  For lockless
 * (TCQ_F_NOLOCK) qdiscs the qdisc lock is taken here to protect
 * skb_bad_txq; stats are adjusted per-cpu or globally to match.
 * Returns the skb, or NULL if the list is empty or the txq still
 * cannot transmit.
 */
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			/* txq still frozen/stopped: leave skb queued */
			skb = NULL;
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

/* Park an skb that could not be sent (its txq was frozen/stopped) on
 * q->skb_bad_txq so a later dequeue can retry it.  The qdisc lock is
 * taken only for lockless (TCQ_F_NOLOCK) qdiscs; backlog/qlen stats
 * are bumped per-cpu or globally to match the qdisc flavor.
 */
static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

/* Requeue a (possibly linked) chain of skbs onto q->gso_skb for a
 * later retry, updating requeue/backlog/qlen stats, then reschedule
 * the qdisc.  Locked-qdisc variant: caller already holds the root
 * lock, so no extra locking here.  Always returns 0.
 */
static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);
		q->qstats.requeues++;
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;	/* it's still part of the queue */

		skb = next;
	}
	__netif_schedule(q);

	return 0;
}

/* Requeue a (possibly linked) chain of skbs onto q->gso_skb for a
 * later retry.  Lockless-qdisc (TCQ_F_NOLOCK) variant: takes the
 * qdisc lock itself and uses per-cpu stats.  Always returns 0.
 */
static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = qdisc_lock(q);

	spin_lock(lock);
	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		qdisc_qstats_cpu_requeues_inc(q);
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);

		skb = next;
	}
	spin_unlock(lock);

	/* reschedule outside the qdisc lock */
	__netif_schedule(q);

	return 0;
}

/* Dispatch the requeue to the variant matching the qdisc's locking
 * model: TCQ_F_NOLOCK qdiscs must take the qdisc lock themselves.
 */
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	return (q->flags & TCQ_F_NOLOCK) ? dev_requeue_skb_locked(skb, q)
					 : __dev_requeue_skb(skb, q);
}

/* Greedily dequeue more packets behind @skb (linking them via
 * skb->next) while the txq's bulk byte budget allows it, so the driver
 * can be handed a burst in one go.  *packets is incremented once per
 * dequeued packet (a GSO aggregate counts as one).
 */
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	/* terminate the chain at the last dequeued skb */
	skb_mark_not_on_list(skb);
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	/* pull up to 8 more packets, stopping at the first one that
	 * maps to a different txq (that one is parked on skb_bad_txq)
	 */
	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 *
 * Order of precedence: previously requeued skbs (gso_skb), then skbs
 * parked on skb_bad_txq, then a fresh q->dequeue() — the latter two
 * are bulk-extended when possible.  *validate tells the caller whether
 * the returned skb(s) still need validate_xmit_skb_list().
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		/* lockless qdiscs protect gso_skb with the qdisc lock */
		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skb in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			/* txq still frozen/stopped: leave it requeued */
			skb = NULL;
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq))
		return skb;

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb))
		goto bulk;
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning running seqcount bit guarantees that
 * only one CPU can execute this function.
 *
 * @root_lock may be NULL (lockless qdiscs); when non-NULL it is
 * dropped around skb validation and the driver call, and re-acquired
 * before returning.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen backoff
 *				true   - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		/* validation consumed/dropped everything */
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				false - queue is empty or throttled.
 *				true  - a packet was dequeued and handed to
 *					sch_direct_xmit().
 *
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	/* lockless qdiscs pass a NULL root lock to sch_direct_xmit() */
	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

/* Drain the qdisc until it is empty, the dev_tx_weight quota is
 * exhausted, or another task needs the CPU; in the latter two cases
 * processing is deferred via __netif_schedule().
 */
void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
		 */
		quota -= packets;
		if (quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}
}

/* Return the most recent trans_start across all tx queues of @dev,
 * resolving vlan/macvlan devices to their underlying real device
 * first.  Per-queue trans_start values of 0 are ignored.
 */
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

/* Tx watchdog timer callback: if the device is up with carrier and any
 * stopped tx queue has not transmitted within watchdog_timeo jiffies,
 * warn once and invoke the driver's ndo_tx_timeout().  Re-arms itself
 * while the device remains active; drops the reference taken when the
 * timer was armed.
 */
static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			/* re-arm; take an extra dev ref if timer was idle */
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

/* Arm the tx watchdog timer if the driver implements ndo_tx_timeout,
 * defaulting watchdog_timeo to 5 seconds when unset.  Holds an extra
 * device reference when the timer transitions from idle to pending.
 */
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

/* Local alias used by dev_activate(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

/* Cancel the tx watchdog timer under the tx lock, releasing the
 * device reference held by the pending timer if it was still armed.
 */
static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		/* restart the tx watchdog now that the link is back */
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);
517

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);
534

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

/* Drop every packet; NET_XMIT_CN tells the caller congestion occurred. */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

/* Nothing is ever queued, so there is never anything to dequeue. */
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

/* Dummy netdev_queue backing the statically-built noop_qdisc. */
static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
577

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

/* Map TC_PRIO_* values (0..TC_PRIO_MAX) to one of the three bands. */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

/* Return the skb_array ring backing the given priority band. */
static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

/* Enqueue @skb on the ring for its priority band; drops (with per-cpu
 * drop stats) when the ring is full.  Stats are per-cpu because
 * pfifo_fast runs lockless (TCQ_F_NOLOCK | TCQ_F_CPUSTATS).
 */
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err))
		return qdisc_drop_cpu(skb, qdisc, to_free);

	qdisc_qstats_cpu_qlen_inc(qdisc);
	/* Note: skb can not be used after skb_array_produce(),
	 * so we better not use qdisc_qstats_cpu_backlog_inc()
	 */
	this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len);
	return NET_XMIT_SUCCESS;
}

/* Dequeue from the highest-priority (lowest-numbered) non-empty band,
 * updating per-cpu backlog/bstats/qlen on success.  Returns NULL when
 * all bands are empty.
 */
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_qstats_cpu_backlog_dec(qdisc, skb);
		qdisc_bstats_cpu_update(qdisc, skb);
		qdisc_qstats_cpu_qlen_dec(qdisc);
	}

	return skb;
}

/* Peek at the head of the highest-priority non-empty band without
 * consuming it or touching stats.
 */
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

/* Drain and free every queued skb from all three band rings, then zero
 * the per-cpu backlog/qlen counters.
 */
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			kfree_skb(skb);
	}

	for_each_possible_cpu(i) {
		struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);

		q->backlog = 0;
		q->qlen = 0;
	}
}

/* Dump the band count and priority map as a TCA_OPTIONS attribute.
 * Returns the message length, or -1 if the attribute did not fit.
 */
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

/* Allocate one skb_array ring per band, each sized to the device's
 * tx_queue_len.  Partially-initialized rings are cleaned up by
 * pfifo_fast_destroy() on the error path.
 */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

/* Tear down the per-band rings allocated by pfifo_fast_init(). */
static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783
static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

/* pfifo_fast runs lockless with per-cpu stats (see static_flags). */
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);
L
Linus Torvalds 已提交
799

800
static struct lock_class_key qdisc_tx_busylock;
801
static struct lock_class_key qdisc_running_key;
802

803
/* Allocate and minimally initialize a Qdisc for @dev_queue using @ops:
 * NUMA-local, QDISC_ALIGN-aligned allocation (with a manual-alignment
 * fallback), queue/list/lock initialization, optional per-cpu stats
 * for TCQ_F_CPUSTATS ops, and a reference on the owning device.
 * Returns the qdisc or an ERR_PTR (-EINVAL for a missing queue,
 * -ENOBUFS on allocation failure).
 */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	/* NOTE(review): this sets the lockdep class of busylock a second
	 * time rather than of seqlock — looks like a copy-paste; confirm
	 * against upstream intent before changing.
	 */
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(p);
errout:
	return ERR_PTR(err);
}

/* Allocate a qdisc via qdisc_alloc(), set its parent, and run the ops'
 * init hook (with a NULL options attribute).  Takes a module reference
 * on @ops->owner, released again on any failure.  Returns the ready
 * qdisc, or NULL on error.
 */
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0)
		return sch;

	/* init failed: drop our reference (frees via qdisc_destroy) */
	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
908

/* Under qdisc_lock(qdisc) and BH! */

/* Reset the qdisc via its ops->reset hook, free any skbs still held on
 * the gso_skb and skb_bad_txq lists, and zero qlen/backlog.
 */
void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

	if (ops->reset)
		ops->reset(qdisc);

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
933

/* Free the qdisc's per-cpu stats (if any) and the qdisc allocation
 * itself, accounting for the alignment padding recorded at alloc time.
 */
void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

/* RCU callback wrapper: frees the qdisc after a grace period. */
void qdisc_free_cb(struct rcu_head *head)
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

/* Fully tear down a qdisc: unhash it, drop its stab and rate
 * estimator, run the ops' reset/destroy hooks, release the module and
 * device references, free any still-queued skbs, and schedule the
 * actual memory release via RCU.
 */
static void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	/* memory is released after an RCU grace period */
	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

/* Drop a reference; destroys the qdisc when the last one goes away.
 * Builtin qdiscs (TCQ_F_BUILTIN) are never destroyed.
 */
void qdisc_put(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
L
Linus Torvalds 已提交
992

V
Vlad Buslov 已提交
/* Version of qdisc_put() that is called with rtnl mutex unlocked.
 * Intended to be used as optimization, this function only takes rtnl lock if
 * qdisc reference counter reached zero.
 */

void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	/* rtnl is held here iff the refcount hit zero */
	qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

/* Attach toplevel qdisc to device queue.
 * Installs @qdisc (or noop_qdisc when NULL) as qdisc_sleeping under the
 * old qdisc's root lock, points the active qdisc at noop_qdisc, and
 * returns the previous sleeping qdisc to the caller.
 */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);
1030

/* netdev_for_each_tx_queue() callback: create the default qdisc
 * (noqueue for IFF_NO_QUEUE devices) for one tx queue and install it
 * as qdisc_sleeping.  On failure the queue is left untouched.
 */
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
	}
	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

/* Install default qdiscs on a device: per-queue defaults for
 * single-queue / IFF_NO_QUEUE devices, or a single mq root qdisc for
 * multiqueue devices, and hash the resulting root when NET_SCHED is on.
 */
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

/* netdev_for_each_tx_queue() callback used by dev_activate(): make
 * the sleeping qdisc the active one, clearing its DEACTIVATED bit.
 * @_need_watchdog is non-NULL for real tx queues (NULL for the ingress
 * queue); for those, trans_start is reset and the flag set so the
 * caller arms the tx watchdog.
 */
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

L
Linus Torvalds 已提交
1093 1094
/**
 *	dev_activate - enable packet scheduling on a device
 *	@dev: device to activate
 *
 *	Attaches default qdiscs if none are configured yet, then switches
 *	every TX queue (and the ingress queue, if any) over to its sleeping
 *	qdisc and arms the TX watchdog.  Activation is deferred until the
 *	next carrier-on event while the carrier is down.
 */
void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	/* Ingress has no watchdog, hence the NULL argument. */
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);
1120

1121 1122 1123
/* Detach the active qdisc from one queue, replacing it with
 * @_qdisc_default (the callers here pass &noop_qdisc), mark it
 * deactivated and reset it.  Lockless (TCQ_F_NOLOCK) qdiscs
 * additionally take qdisc->seqlock around the swap.
 */
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		bool nolock = qdisc->flags & TCQ_F_NOLOCK;

		/* Lock order: seqlock first, then the qdisc root lock. */
		if (nolock)
			spin_lock_bh(&qdisc->seqlock);
		spin_lock_bh(qdisc_lock(qdisc));

		/* Built-in qdiscs (noop/noqueue) are never marked deactivated. */
		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
		if (nolock)
			spin_unlock_bh(&qdisc->seqlock);
	}
}

1148
/* Return true if any TX queue qdisc of @dev is still running or has a
 * pending run scheduled (__QDISC_STATE_SCHED).  dev_deactivate_many()
 * spins on this until all outstanding qdisc_run calls have drained.
 */
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);

		/* Sample both "running now" and "scheduled to run". */
		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

1175 1176 1177 1178 1179 1180 1181 1182 1183 1184
/* Per-queue helper for dev_deactivate_many(): flush any stale skb lists
 * and statistics left in the (now detached) sleeping qdisc.  @none only
 * satisfies the netdev_for_each_tx_queue() callback signature.
 */
static void dev_qdisc_reset(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *none)
{
	struct Qdisc *q = dev_queue->qdisc_sleeping;

	if (!q)
		return;

	qdisc_reset(q);
}

1185 1186 1187 1188 1189 1190 1191
/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		/* Swap every TX queue (and ingress) over to &noop_qdisc so
		 * new packets stop being queued to the old qdiscs.
		 */
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev))
			yield();
		/* The new qdisc is assigned at this point so we can safely
		 * unwind stale skb lists and qdisc statistics
		 */
		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
		if (dev_ingress_queue(dev))
			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
	}
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

1229
	list_add(&dev->close_list, &single);
1230
	dev_deactivate_many(&single);
1231
	list_del(&single);
L
Linus Torvalds 已提交
1232
}
1233
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
1234

1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267
/* Propagate dev->tx_queue_len to one queue's sleeping qdisc via its
 * optional ->change_tx_queue_len() op.  Returns 0 when the qdisc has no
 * such op, otherwise whatever the op returns.
 */
static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	const struct Qdisc_ops *ops = qdisc->ops;

	if (!ops->change_tx_queue_len)
		return 0;

	return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
}

/* Apply a new dev->tx_queue_len to every TX queue's qdisc.  The device
 * is deactivated around the update when it is up.  Stops at the first
 * failing queue and returns that error (0 on success).
 */
int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool up = dev->flags & IFF_UP;
	unsigned int i;
	int ret = 0;

	if (up)
		dev_deactivate(dev);

	for (i = 0; i < dev->num_tx_queues; i++) {
		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);

		/* TODO: revert changes on a partial failure */
		if (ret)
			break;
	}

	if (up)
		dev_activate(dev);
	return ret;
}

1268 1269
/* Initialize one queue with @_qdisc (callers pass &noop_qdisc) as both
 * the active and the sleeping qdisc, and prepare the qdisc's deferred
 * skb lists.
 */
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
	__skb_queue_head_init(&qdisc->gso_skb);
	__skb_queue_head_init(&qdisc->skb_bad_txq);
}

L
Linus Torvalds 已提交
1280 1281
/* Set up the initial (noop) scheduling state for a new device on all
 * TX queues and the ingress queue, and initialize its TX watchdog timer.
 */
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

1290 1291 1292
/* Tear down one queue's qdisc: point both the active and sleeping
 * pointers at @_qdisc_default (callers pass &noop_qdisc) and drop the
 * old qdisc's reference.
 */
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_put(qdisc);
	}
}

/* Release all qdiscs of a device being torn down: every TX queue, the
 * ingress queue, and the root dev->qdisc reference.  The watchdog timer
 * is expected to be stopped by now (hence the WARN_ON).
 */
void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_put(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}
1315

1316
/* Precompute the mult/shift pair used on the fast path to turn a packet
 * length into a transmit time in nanoseconds.  The effective rate in
 * bytes per second is the larger of conf->rate and @rate64; @r is
 * fully overwritten.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			/* Stop before mult (32 bit) or factor (64 bit)
			 * would overflow on the next doubling.
			 */
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363

/* Intentionally empty RCU callback: mini_qdisc_pair_swap() uses
 * call_rcu_bh() with it purely as a grace-period marker that the
 * rcu_barrier_bh() calls there can wait on.
 */
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

/* Publish @tp_head as the active filter list of the pair, flipping
 * between the two preallocated miniq buffers so that readers never see
 * a half-updated miniq.  A NULL @tp_head unpublishes the pair entirely.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		/* Wait for flying RCU callback before it is freed. */
		rcu_barrier_bh();
		return;
	}

	/* Write into whichever buffer is not currently published. */
	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is counterpart of the rcu barriers above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

/* Initialize a miniq pair: both buffers share @qdisc's per-CPU stats
 * pointers, and @p_miniq is the published pointer that
 * mini_qdisc_pair_swap() later flips between the two buffers.
 */
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);
EXPORT_SYMBOL(mini_qdisc_pair_init);