// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
#include <trace/events/net.h>
#include <net/xfrm.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected by
 * the qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

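/* skb_bad_txq holds skbs that could not be sent on their destination
 * subqueue (e.g. set aside by try_bulk_dequeue_skb_slow() when a bulk
 * spans several subqueues); dequeue_skb() retries them before pulling
 * new packets, but only once the subqueue is no longer frozen/stopped.
 * For TCQ_F_NOLOCK qdiscs the list is protected by qdisc_lock(q).
 */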
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		/* it's still part of the queue */
		if (qdisc_is_percpu_stats(q)) {
			qdisc_qstats_cpu_requeues_inc(q);
			qdisc_qstats_cpu_backlog_inc(q, skb);
			qdisc_qstats_cpu_qlen_inc(q);
		} else {
			q->qstats.requeues++;
			qdisc_qstats_backlog_inc(q, skb);
			q->q.qlen++;
		}

		skb = next;
	}
	if (lock)
		spin_unlock(lock);
	__netif_schedule(q);
}

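/* Bulk dequeue: after a first skb has been pulled, keep pulling packets from
 * the qdisc and chain them via skb->next as long as the BQL byte budget of
 * the transmit queue (qdisc_avail_bulklimit()) has not been used up, so that
 * one xmit call can hand several packets to the driver.
 */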
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb_mark_not_on_list(skb);
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skbs in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason for requeuing without holding the tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq))
		return skb;

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb))
		goto bulk;
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning the running seqcount guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen, back off
 *				true   - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to the device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive:
 *  if one is grabbed, the other must be free.
 *
 * Note that this procedure can be called by a watchdog timer.
 *
 * Returns to the caller:
 *				false - queue is empty or throttled.
 *				true  - queue is not empty.
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
		 */
		quota -= packets;
		if (quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}
}

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				trace_net_dev_xmit_timeout(dev, i);
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
	.gso_skb = {
		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
	},
	.skb_bad_txq = {
		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
	},
};
EXPORT_SYMBOL(noop_qdisc);

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

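/* Each band is an independent lockless skb_array ring; band 0 carries the
 * highest-priority traffic and is always drained first (see
 * pfifo_fast_dequeue() below), so lower bands only get service once the
 * bands above them are empty.
 */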
static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err))
		return qdisc_drop_cpu(skb, qdisc, to_free);

	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_update_stats_at_dequeue(qdisc, skb);
	} else {
		qdisc->empty = true;
	}

	return skb;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* A NULL ring is possible if the destroy path runs because
		 * skb_array_init() failed in pfifo_fast_init().
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			kfree_skb(skb);
	}

	if (qdisc_is_percpu_stats(qdisc)) {
		for_each_possible_cpu(i) {
			struct gnet_stats_queue *q;

			q = per_cpu_ptr(qdisc->cpu_qstats, i);
			q->backlog = 0;
			q->qlen = 0;
		}
	}
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* A NULL ring is possible if the destroy path runs because
		 * skb_array_init() failed in pfifo_fast_init().
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

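/* pfifo_fast is the default qdisc (see default_qdisc_ops above). Its
 * static_flags make every instance run without the qdisc root lock
 * (TCQ_F_NOLOCK) and keep its statistics in per-CPU counters
 * (TCQ_F_CPUSTATS).
 */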
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);

static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got unaligned memory, ask for more and align it ourselves */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	/* seqlock has the same scope as busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	lockdep_set_class(&sch->seqlock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	sch->empty = true;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(p);
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0)
		return sch;

	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

	if (ops->reset)
		ops->reset(qdisc);

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

static void qdisc_free_cb(struct rcu_head *head)
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

static void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

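/* Drop a reference to the qdisc; once the last reference is gone the qdisc
 * is destroyed (and freed after an RCU grace period via qdisc_free_cb()).
 * Callers are expected to hold the rtnl mutex, matching the
 * rtnl_dereference() done in qdisc_destroy(); see qdisc_put_unlocked()
 * below for the variant that takes rtnl only when actually destroying.
 */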
void qdisc_put(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);

/* Version of qdisc_put() that is called with rtnl mutex unlocked.
 * Intended to be used as optimization, this function only takes rtnl lock if
 * qdisc reference counter reached zero.
 */

void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
	}
	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

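/* Pick the boot-time default: single-queue (or IFF_NO_QUEUE) devices get the
 * default qdisc attached directly to tx queue 0, while real multiqueue
 * devices get an mq qdisc as root, whose ->attach() sets up per-queue child
 * qdiscs (see sch_mq.c).
 */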
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to the device;
	 * create a default one for devices which need queueing,
	 * and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		bool nolock = qdisc->flags & TCQ_F_NOLOCK;

		if (nolock)
			spin_lock_bh(&qdisc->seqlock);
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
		if (nolock)
			spin_unlock_bh(&qdisc->seqlock);
	}
}

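/* Check whether any tx queue of the device still has its qdisc running or
 * scheduled for execution; dev_deactivate_many() spins on this below so
 * that no qdisc_run() is in flight when the old qdiscs are finally reset.
 */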
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

static void dev_qdisc_reset(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *none)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;

	if (qdisc)
		qdisc_reset(qdisc);
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev))
			yield();
		/* The new qdisc is assigned at this point so we can safely
		 * unwind stale skb lists and qdisc statistics
		 */
		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
		if (dev_ingress_queue(dev))
			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
	}
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->change_tx_queue_len)
		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
	return 0;
}

int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool up = dev->flags & IFF_UP;
	unsigned int i;
	int ret = 0;

	if (up)
		dev_deactivate(dev);

	for (i = 0; i < dev->num_tx_queues; i++) {
		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);

		/* TODO: revert changes on a partial failure */
		if (ret)
			break;
	}

	if (up)
		dev_activate(dev);
	return ret;
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_put(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_put(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

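/* Precompute the mult/shift pair (and linklayer/overhead) used by the rate
 * helpers so that per-packet transmit-time calculations such as
 * psched_l2t_ns() can turn a byte length into nanoseconds with a multiply
 * and a shift instead of a 64-bit divide in the fast path.
 */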
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
1329
	 */
1330 1331 1332 1333 1334 1335
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
1336
				break;
1337 1338
			factor <<= 1;
			r->shift++;
1339 1340 1341 1342
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1343 1344 1345 1346 1347 1348 1349 1350

static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
1351 1352 1353 1354 1355
	/* Protected with chain0->filter_chain_lock.
	 * Can't access chain directly because tp_head can be NULL.
	 */
	struct mini_Qdisc *miniq_old =
		rcu_dereference_protected(*miniqp->p_miniq, 1);
1356 1357 1358 1359
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
1360
		/* Wait for flying RCU callback before it is freed. */
1361
		rcu_barrier();
1362 1363 1364 1365 1366 1367 1368
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
1369
	 * we are about to modify. So wait until previous call_rcu callback
1370 1371
	 * is done.
	 */
1372
	rcu_barrier();
1373 1374 1375 1376
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
1377
		/* This is counterpart of the rcu barriers above. We need to
1378 1379 1380
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
1381
		call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);