// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
#include <trace/events/net.h>
#include <net/xfrm.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)

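/* skb_bad_txq holds packets set aside during dequeue because they are
 * destined for a different (possibly stopped) TX queue;
 * __skb_dequeue_bad_txq() returns SKB_XOFF_MAGIC while that queue is still
 * frozen or stopped.
 */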
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = SKB_XOFF_MAGIC;
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

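/* Put an skb chain back onto q->gso_skb so it is retried first on the next
 * run, account it as still queued, and reschedule the qdisc.
 */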
static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		/* it's still part of the queue */
		if (qdisc_is_percpu_stats(q)) {
			qdisc_qstats_cpu_requeues_inc(q);
			qdisc_qstats_cpu_backlog_inc(q, skb);
			qdisc_qstats_cpu_qlen_inc(q);
		} else {
			q->qstats.requeues++;
			qdisc_qstats_backlog_inc(q, skb);
			q->q.qlen++;
		}

		skb = next;
	}
	if (lock)
		spin_unlock(lock);
	__netif_schedule(q);
}

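/* Dequeue additional packets, up to the BQL byte budget of the TX queue, and
 * chain them via skb->next so the driver can be handed a burst in one shot.
 */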
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb_mark_not_on_list(skb);
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skb in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq))
		return skb;

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb)) {
		if (skb == SKB_XOFF_MAGIC)
			return NULL;
		goto bulk;
	}
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning running seqcount bit guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen backoff
 *				true   - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		quota -= packets;
		if (quota <= 0) {
			__netif_schedule(q);
			break;
		}
	}
}

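/* Most recent trans_start across all TX queues of the underlying real device. */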
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				trace_net_dev_xmit_timeout(dev, i);
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev, i);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
	.gso_skb = {
		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
	},
	.skb_bad_txq = {
		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
	},
};
EXPORT_SYMBOL(noop_qdisc);

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err)) {
		if (qdisc_is_percpu_stats(qdisc))
			return qdisc_drop_cpu(skb, qdisc, to_free);
		else
			return qdisc_drop(skb, qdisc, to_free);
	}

	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_update_stats_at_dequeue(qdisc, skb);
	} else {
		WRITE_ONCE(qdisc->empty, true);
	}

	return skb;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			kfree_skb(skb);
	}

	if (qdisc_is_percpu_stats(qdisc)) {
		for_each_possible_cpu(i) {
			struct gnet_stats_queue *q;

			q = per_cpu_ptr(qdisc->cpu_qstats, i);
			q->backlog = 0;
			q->qlen = 0;
		}
	}
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);

static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	lockdep_set_class(&sch->seqlock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	sch->empty = true;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(p);
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0)
		return sch;

	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

	if (ops->reset)
		ops->reset(qdisc);

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

static void qdisc_free_cb(struct rcu_head *head)
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

static void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

void qdisc_put(struct Qdisc *qdisc)
{
	if (!qdisc)
		return;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);

/* Version of qdisc_put() that is called with rtnl mutex unlocked.
 * Intended to be used as optimization, this function only takes rtnl lock if
 * qdisc reference counter reached zero.
 */

void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;
	else if (dev->type == ARPHRD_CAN)
		ops = &pfifo_fast_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc)
		return;

	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

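/* Pick the root qdisc when a device comes up: single-queue and IFF_NO_QUEUE
 * devices get a per-queue default directly, multiqueue devices get an mq
 * qdisc whose ->attach() installs a default child qdisc per TX queue.
 */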
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}

	/* Detect default qdisc setup/init failed and fallback to "noqueue" */
	if (dev->qdisc == &noop_qdisc) {
		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
		dev->priv_flags |= IFF_NO_QUEUE;
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
		dev->priv_flags ^= IFF_NO_QUEUE;
	}

#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		bool nolock = qdisc->flags & TCQ_F_NOLOCK;

		if (nolock)
			spin_lock_bh(&qdisc->seqlock);
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
		if (nolock)
			spin_unlock_bh(&qdisc->seqlock);
	}
}

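/* True if any TX queue's qdisc is still running or scheduled for a softirq run. */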
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

static void dev_qdisc_reset(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *none)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;

	if (qdisc)
		qdisc_reset(qdisc);
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev)) {
			/* wait_event() would avoid this sleep-loop but would
			 * require expensive checks in the fast paths of packet
			 * processing which isn't worth it.
			 */
			schedule_timeout_uninterruptible(1);
		}
		/* The new qdisc is assigned at this point so we can safely
		 * unwind stale skb lists and qdisc statistics
		 */
		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
		if (dev_ingress_queue(dev))
			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
	}
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->change_tx_queue_len)
		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
	return 0;
}

int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool up = dev->flags & IFF_UP;
	unsigned int i;
	int ret = 0;

	if (up)
		dev_deactivate(dev);

	for (i = 0; i < dev->num_tx_queues; i++) {
		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);

		/* TODO: revert changes on a partial failure */
		if (ret)
			break;
	}

	if (up)
		dev_activate(dev);
	return ret;
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_put(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_put(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
1354
	 */
1355 1356 1357 1358 1359 1360
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
1361
				break;
1362 1363
			factor <<= 1;
			r->shift++;
1364 1365 1366 1367
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1368 1369 1370 1371 1372 1373 1374 1375

static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	/* Protected with chain0->filter_chain_lock.
	 * Can't access chain directly because tp_head can be NULL.
	 */
	struct mini_Qdisc *miniq_old =
		rcu_dereference_protected(*miniqp->p_miniq, 1);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		/* Wait for flying RCU callback before it is freed. */
		rcu_barrier();
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
1394
	 * we are about to modify. So wait until previous call_rcu callback
1395 1396
	 * is done.
	 */
1397
	rcu_barrier();
1398 1399 1400 1401
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
1402
		/* This is counterpart of the rcu barriers above. We need to
1403 1404 1405
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
1406
		call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
1407 1408 1409
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

1410 1411 1412 1413 1414 1415 1416 1417
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
				struct tcf_block *block)
{
	miniqp->miniq1.block = block;
	miniqp->miniq2.block = block;
}
EXPORT_SYMBOL(mini_qdisc_pair_block_init);

void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);