/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
#include <net/xfrm.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

static inline int __dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);
		q->qstats.requeues++;
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;	/* it's still part of the queue */

		skb = next;
	}
	__netif_schedule(q);

135 136 137
	return 0;
}

static inline int dev_requeue_skb_locked(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = qdisc_lock(q);

	spin_lock(lock);
	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		qdisc_qstats_cpu_requeues_inc(q);
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);

		skb = next;
	}
	spin_unlock(lock);

	__netif_schedule(q);

	return 0;
}

static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	if (q->flags & TCQ_F_NOLOCK)
		return dev_requeue_skb_locked(skb, q);
	else
		return __dev_requeue_skb(skb, q);
}

static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb->next = NULL;
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb->next = NULL;
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skb in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq))
		return skb;

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb))
		goto bulk;
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning the running seqcount guarantees that only one CPU
 * can execute this function.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen, back off
 *				true   - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		return false;

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				false  - queue is empty or throttled.
 *				true   - queue is not empty.
 *
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
		 */
		quota -= packets;
		if (quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}
}

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};
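/* Illustrative mapping implied by the table above (not part of the original
 * source): TC_PRIO_BESTEFFORT (0) and TC_PRIO_INTERACTIVE_BULK (4) map to
 * band 1, TC_PRIO_BULK (2) to the lowest-priority band 2, and
 * TC_PRIO_INTERACTIVE (6) to band 0, which pfifo_fast dequeues first.
 */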

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err))
		return qdisc_drop_cpu(skb, qdisc, to_free);

	qdisc_qstats_cpu_qlen_inc(qdisc);
	/* Note: skb can not be used after skb_array_produce(),
	 * so we better not use qdisc_qstats_cpu_backlog_inc()
	 */
	this_cpu_add(qdisc->cpu_qstats->backlog, pkt_len);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;
	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);
		if (__skb_array_empty(q))
			continue;
		skb = skb_array_consume_bh(q);
	}
	if (likely(skb)) {
		qdisc_qstats_cpu_backlog_dec(qdisc, skb);
		qdisc_bstats_cpu_update(qdisc, skb);
		qdisc_qstats_cpu_qlen_dec(qdisc);
	}

	return skb;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;
	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);
		skb = __skb_array_peek(q);
	}

	return skb;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;
		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = skb_array_consume_bh(q)) != NULL)
			kfree_skb(skb);
	}

	for_each_possible_cpu(i) {
		struct gnet_stats_queue *q = per_cpu_ptr(qdisc->cpu_qstats, i);

		q->backlog = 0;
		q->qlen = 0;
	}
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;
	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}
	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);

static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}
	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);
	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	lockdep_set_class(&sch->seqlock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(p);
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;
	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}
	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0)
		return sch;

	qdisc_destroy(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

	if (ops->reset)
		ops->reset(qdisc);
	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}
	kfree((char *) qdisc - qdisc->padded);
}

void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);
	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc_free(qdisc);
}
EXPORT_SYMBOL(qdisc_destroy);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;
	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
	}
	if (!netif_is_multiqueue(dev))
	if (!netif_is_multiqueue(dev))
	dev_queue->qdisc_sleeping = qdisc;
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);
	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		bool nolock = qdisc->flags & TCQ_F_NOLOCK;

		if (nolock)
			spin_lock_bh(&qdisc->seqlock);
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
		if (nolock)
			spin_unlock_bh(&qdisc->seqlock);
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);
		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));
		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

static void dev_qdisc_reset(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *none)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;

	if (qdisc)
		qdisc_reset(qdisc);
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}
	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev))
			yield();
		/* The new qdisc is assigned at this point so we can safely
		 * unwind stale skb lists and qdisc statistics
		 */
		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
		if (dev_ingress_queue(dev))
			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
	}
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->change_tx_queue_len)
		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
	return 0;
}

int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool up = dev->flags & IFF_UP;
	unsigned int i;
	int ret = 0;

	if (up)
		dev_deactivate(dev);

	for (i = 0; i < dev->num_tx_queues; i++) {
		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);

		/* TODO: revert changes on a partial failure */
		if (ret)
			break;
	}

	if (up)
		dev_activate(dev);
	return ret;
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
	__skb_queue_head_init(&qdisc->gso_skb);
	__skb_queue_head_init(&qdisc->skb_bad_txq);
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
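	/* Worked example (illustrative, not from the original source): at
	 * 1 Gbit/s, r->rate_bytes_ps is 125,000,000, so the loop below settles
	 * on a mult/shift pair with mult >> shift == 8, i.e. 8 ns per byte;
	 * a 1500 byte packet then costs (1500 * mult) >> shift == 12000 ns.
	 */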
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);

static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		/* Wait for flying RCU callback before it is freed. */
		rcu_barrier_bh();
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is counterpart of the rcu barriers above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);