sch_generic.c 33.2 KB
Newer Older
1
// SPDX-License-Identifier: GPL-2.0-or-later
L
Linus Torvalds 已提交
2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
23
#include <linux/slab.h>
24
#include <linux/if_vlan.h>
25
#include <linux/skb_array.h>
26
#include <linux/if_macvlan.h>
27
#include <net/sch_generic.h>
L
Linus Torvalds 已提交
28
#include <net/pkt_sched.h>
E
Eric Dumazet 已提交
29
#include <net/dst.h>
30
#include <trace/events/qdisc.h>
31
#include <trace/events/net.h>
32
#include <net/xfrm.h>
L
Linus Torvalds 已提交
33

34 35 36 37
/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

L
Linus Torvalds 已提交
38 39
/* Main transmission queue. */

40
/* Modifications to data participating in scheduling must be protected with
41
 * qdisc_lock(qdisc) spinlock.
42 43
 *
 * The idea is the following:
44 45
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
46
 * - updates to tree and tree walking are only done under the rtnl mutex.
L
Linus Torvalds 已提交
47
 */
48

E
Eric Dumazet 已提交
49 50
#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)

51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69
/* Pull one skb off q->skb_bad_txq, but only if its tx queue is no longer
 * frozen/stopped.  Takes qdisc_lock() only for NOLOCK qdiscs (lockless
 * qdiscs protect skb_bad_txq with that spinlock instead of the root lock).
 * Returns the skb, NULL if the list is empty, or SKB_XOFF_MAGIC if the
 * head skb's tx queue is still stopped.
 */
static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			/* skb leaves the queue: roll back backlog/qlen stats */
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			/* txq still stopped: signal caller to give up */
			skb = SKB_XOFF_MAGIC;
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

/* Park an skb on q->skb_bad_txq because it targets a tx queue different
 * from the one we are currently draining.  The skb stays accounted in the
 * qdisc's backlog/qlen so it is not lost from the queue's point of view.
 * qdisc_lock() is taken only for NOLOCK qdiscs.
 */
static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

120
/* Requeue a (possibly chained) skb list onto q->gso_skb after a failed or
 * deferred transmit, updating requeue/backlog/qlen counters, then reschedule
 * the qdisc so the packets are retried.  For NOLOCK qdiscs gso_skb is
 * protected by qdisc_lock(); otherwise the caller holds the root lock.
 */
static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		/* it's still part of the queue */
		if (qdisc_is_percpu_stats(q)) {
			qdisc_qstats_cpu_requeues_inc(q);
			qdisc_qstats_cpu_backlog_inc(q, skb);
			qdisc_qstats_cpu_qlen_inc(q);
		} else {
			q->qstats.requeues++;
			qdisc_qstats_backlog_inc(q, skb);
			q->q.qlen++;
		}

		skb = next;
	}
	if (lock)
		spin_unlock(lock);
	/* kick the qdisc so the requeued packets get another chance */
	__netif_schedule(q);
}

152 153
/* Bulk-dequeue additional packets behind @skb, chaining them via skb->next,
 * until the txq's BQL byte budget is exhausted or the qdisc runs dry.
 * Caller guarantees (via qdisc_may_bulk()) that all packets go to the same
 * tx queue.  *packets is incremented once per dequeued (GSO super-)packet.
 */
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	/* terminate the chain */
	skb_mark_not_on_list(skb);
}

173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188
/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq.
 * Packets for a different queue mapping are parked on skb_bad_txq rather
 * than transmitted out of order; the chain is capped at 8 packets.
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			/* wrong txq: defer it instead of breaking ordering */
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

199 200 201
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 *
 * Order of precedence: previously requeued skbs (gso_skb), then skbs parked
 * on skb_bad_txq, then a fresh q->dequeue() — the last two may be bulked.
 * *validate tells the caller whether validate_xmit_skb_list() still has to
 * run (false for already-validated requeued skbs, unless xfrm offload).
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skb in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			/* txq still stopped: leave skb on gso_skb for later */
			skb = NULL;
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq))
		return skb;

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb)) {
		if (skb == SKB_XOFF_MAGIC)
			return NULL;
		goto bulk;
	}
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

276
/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning running seqcount bit guarantees that
 * only one CPU can execute this function.
 *
 * @root_lock may be NULL for NOLOCK qdiscs; when non-NULL it is held on
 * entry, dropped around the (potentially slow) driver transmit, and
 * re-taken before returning.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen backoff
 *				true   - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		/* xfrm offload needs another pass: requeue and bail out */
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		/* validation consumed/freed the skb list: nothing to send */
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

338 339 340
/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	/* NOLOCK qdiscs run without the root lock */
	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

379
/* Drain the qdisc, transmitting up to dev_tx_weight packets per invocation.
 * If the budget runs out while packets remain, reschedule via
 * __netif_schedule() instead of hogging the CPU.
 */
void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		quota -= packets;
		if (quota <= 0) {
			__netif_schedule(q);
			break;
		}
	}
}

393 394
/* Return the most recent trans_start timestamp (in jiffies) across all of
 * the device's tx queues.  For vlan/macvlan devices the underlying real
 * device is consulted, since software stacked devices don't update
 * trans_start themselves.
 */
unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

413
/* Tx watchdog timer callback: if any stopped tx queue has not transmitted
 * within dev->watchdog_timeo jiffies, report a hung queue via the driver's
 * ndo_tx_timeout() and re-arm the timer.  The matching dev_hold() for the
 * dev_put() at the end was taken when the timer was armed.
 */
static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				/* i still indexes the timed-out queue here */
				trace_net_dev_xmit_timeout(dev, i);
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev, i);
			}
			/* re-arm; take a ref if the timer was not pending */
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

/* Arm the tx watchdog timer if the driver implements ndo_tx_timeout().
 * A non-positive watchdog_timeo is normalised to 5 seconds.  A reference
 * on the device is held while the timer is pending.
 */
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

/* Internal alias used by dev_activate(); see __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

/* Stop the tx watchdog timer, dropping the device reference the pending
 * timer held.  netif_tx_lock_bh serializes against the timer callback.
 */
static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

481 482 483 484
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 * Clears __LINK_STATE_NOCARRIER, bumps the carrier-up counter, fires a
 * linkwatch event and (re)arms the tx watchdog on a running device.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		/* devices not yet registered have no carrier bookkeeping */
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);
499

500 501 502 503 504 505
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 * Sets __LINK_STATE_NOCARRIER, bumps the carrier-down counter and fires a
 * linkwatch event.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);
516

L
Linus Torvalds 已提交
517 518 519 520 521
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

522 523
/* noop enqueue: unconditionally drop the packet and report congestion. */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

529
/* noop dequeue: there is never anything queued. */
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

534
/* Ops for the built-in no-op qdisc: drops on enqueue, empty on dequeue. */
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

543
/* Dummy netdev_queue backing noop_qdisc (circular reference by design). */
static struct netdev_queue noop_netdev_queue = {
	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
	.qdisc_sleeping	=	&noop_qdisc,
};

L
Linus Torvalds 已提交
548 549 550 551
/* The singleton built-in no-op qdisc, attached to inactive devices.
 * gso_skb and skb_bad_txq are statically initialised as empty lists since
 * qdisc_alloc() never runs for this instance.
 */
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
	.gso_skb = {
		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
	},
	.skb_bad_txq = {
		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
	},
};
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
571

572 573
static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

/* "noqueue" qdisc: used for virtual devices that transmit directly and
 * never queue (enqueue is NULLed by noqueue_init, see above).
 */
struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

E
Eric Dumazet 已提交
592 593 594
/* Map skb->priority (TC_PRIO_*) to one of the three pfifo_fast bands;
 * band 0 is dequeued first (highest priority).
 */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};
595 596 597 598 599 600 601

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

602 603
/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands (one lockless skb_array per band)
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

610 611
static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
612
{
613
	return &priv->q[band];
614 615
}

616 617
/* Lockless enqueue: push the skb onto the ring for its priority band.
 * A full ring (or other produce failure) results in a drop; pkt_len is
 * captured before skb_array_produce() since the skb may be consumed by
 * another CPU immediately afterwards.
 */
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err)) {
		if (qdisc_is_percpu_stats(qdisc))
			return qdisc_drop_cpu(skb, qdisc, to_free);
		else
			return qdisc_drop(skb, qdisc, to_free);
	}

	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
	return NET_XMIT_SUCCESS;
}

E
Eric Dumazet 已提交
638
/* Lockless dequeue: consume from the lowest-numbered (highest priority)
 * non-empty band.  When all bands are empty, publish qdisc->empty so the
 * xmit path can skip scheduling this qdisc.
 */
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_update_stats_at_dequeue(qdisc, skb);
	} else {
		WRITE_ONCE(qdisc->empty, true);
	}

	return skb;
}

E
Eric Dumazet 已提交
661
/* Peek at the head skb of the highest-priority non-empty band without
 * consuming it.
 */
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

E
Eric Dumazet 已提交
676
/* Drop every queued skb from all bands and zero the per-cpu queue stats. */
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			kfree_skb(skb);
	}

	if (qdisc_is_percpu_stats(qdisc)) {
		for_each_possible_cpu(i) {
			struct gnet_stats_queue *q;

			q = per_cpu_ptr(qdisc->cpu_qstats, i);
			q->backlog = 0;
			q->qlen = 0;
		}
	}
}

706 707 708 709
/* Dump the band count and priority map to userspace as TCA_OPTIONS. */
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

719 720
/* Allocate one skb_array ring per band, sized by the device tx_queue_len.
 * Partially allocated rings are cleaned up by pfifo_fast_destroy() on the
 * error path.
 */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

744 745 746 747 748 749 750 751 752 753 754
/* Free the per-band ring buffers; skbs were already freed by reset. */
static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780
/* Resize all band rings together when the device tx_queue_len changes. */
static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

781
/* Default qdisc: lockless 3-band FIFO with per-cpu stats (TCQ_F_NOLOCK). */
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);
L
Linus Torvalds 已提交
796

797 798 799
static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

800
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
801 802
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
803 804 805
{
	void *p;
	struct Qdisc *sch;
E
Eric Dumazet 已提交
806
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
807
	int err = -ENOBUFS;
808 809 810
	struct net_device *dev;

	if (!dev_queue) {
811
		NL_SET_ERR_MSG(extack, "No device queue given");
812 813 814
		err = -EINVAL;
		goto errout;
	}
L
Linus Torvalds 已提交
815

816
	dev = dev_queue->dev;
817 818 819
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

L
Linus Torvalds 已提交
820
	if (!p)
821 822
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
E
Eric Dumazet 已提交
823 824 825 826 827 828 829 830 831 832
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
833
	__skb_queue_head_init(&sch->gso_skb);
834
	__skb_queue_head_init(&sch->skb_bad_txq);
835 836
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);
837

838 839 840 841 842 843 844 845 846 847 848 849 850
	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

851
	spin_lock_init(&sch->busylock);
852 853 854
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

855 856
	/* seqlock has the same scope of busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
857 858 859
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

860
	seqcount_init(&sch->running);
861 862
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);
863

L
Linus Torvalds 已提交
864
	sch->ops = ops;
865
	sch->flags = ops->static_flags;
L
Linus Torvalds 已提交
866 867
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
868
	sch->dev_queue = dev_queue;
869
	sch->empty = true;
870
	dev_hold(dev);
871
	refcount_set(&sch->refcnt, 1);
872 873

	return sch;
874 875
errout1:
	kfree(p);
876
errout:
877
	return ERR_PTR(err);
878 879
}

880
/* Allocate a qdisc with the given ops and run its init callback.
 * Takes a module reference on ops->owner for the lifetime of the qdisc.
 * Returns NULL on any failure (reference and allocation are rolled back).
 */
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0) {
		trace_qdisc_create(ops, dev_queue->dev, parentid);
		return sch;
	}

	/* init failed: qdisc_put() tears down alloc and module ref */
	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
908

909
/* Under qdisc_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
910 911 912

/* Flush a qdisc: run the ops reset callback, free any requeued
 * (gso_skb) and deferred (skb_bad_txq) packets, and zero qlen/backlog.
 * Caller holds qdisc_lock(qdisc) with BH disabled (see comment above).
 */
void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

	trace_qdisc_reset(qdisc);

	if (ops->reset)
		ops->reset(qdisc);

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
935

936
/* Free the qdisc memory, including its per-cpu stats.  The kfree() offset
 * undoes the alignment padding recorded by qdisc_alloc() in sch->padded.
 */
void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

946
static void qdisc_free_cb(struct rcu_head *head)
V
Vlad Buslov 已提交
947 948 949 950 951 952
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

953
/* Tear down a qdisc whose refcount reached zero: unhash it, drop its stab
 * and rate estimator, flush queued packets, run the ops destroy callback,
 * release the module and device references, and free via RCU.
 */
static void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);

	qdisc_reset(qdisc);

	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	trace_qdisc_destroy(qdisc);

	/* readers may still hold RCU references; defer the actual free */
	call_rcu(&qdisc->rcu, qdisc_free_cb);
}
976 977 978

void qdisc_put(struct Qdisc *qdisc)
{
979 980 981
	if (!qdisc)
		return;

982 983 984 985 986 987 988
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);
L
Linus Torvalds 已提交
989

V
Vlad Buslov 已提交
990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005
/* Version of qdisc_put() that is called with rtnl mutex unlocked.
 * Intended to be used as optimization, this function only takes rtnl lock if
 * qdisc reference counter reached zero.
 */

void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025
/* Attach toplevel qdisc to device queue.
 * The old sleeping qdisc is returned to the caller (which owns disposing
 * of it); the active pointer is parked on noop_qdisc until dev_activate().
 */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);
1027

1028 1029 1030 1031
/* Create and park the default qdisc for one tx queue (netdev_for_each_tx_queue
 * callback).  IFF_NO_QUEUE devices get noqueue; CAN devices always get
 * pfifo_fast regardless of the configured default.
 */
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;
	else if(dev->type == ARPHRD_CAN)
		ops = &pfifo_fast_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc)
		return;

	/* single-queue devices keep everything on this one txq */
	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

1049 1050 1051 1052 1053 1054 1055
/* Install default qdiscs on all tx queues: per-queue defaults for
 * single-queue / no-queue devices, an mq root qdisc otherwise.  If setup
 * left the noop qdisc in place (allocation/init failure), fall back to
 * noqueue with a warning.
 */
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}

	/* Detect default qdisc setup/init failed and fallback to "noqueue" */
	if (dev->qdisc == &noop_qdisc) {
		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
		dev->priv_flags |= IFF_NO_QUEUE;
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
		dev->priv_flags ^= IFF_NO_QUEUE;
	}

#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

1086 1087 1088 1089
/* Make the sleeping qdisc the active one for a tx queue (activation path).
 * _need_watchdog is NULL for the ingress queue, which needs no watchdog.
 */
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

L
Linus Torvalds 已提交
1103 1104
/* dev_activate - attach and activate qdiscs of @dev
 *
 * Installs default qdiscs if none are attached yet (dev->qdisc is still
 * &noop_qdisc), then promotes every queue's sleeping qdisc to active via
 * transition_one_qdisc().  Activation is skipped while carrier is off
 * and retried on the next carrier-on event.
 * NOTE(review): presumably runs under RTNL like its rtnetlink callers —
 * confirm against call sites.
 */
void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		/* At least one tx queue was transitioned: arm the tx hang
		 * watchdog with a fresh trans timestamp.
		 */
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);
1130

1131 1132 1133
/* Per-queue callback for dev_deactivate_many(): swap the active qdisc
 * for @_qdisc_default (normally &noop_qdisc) and reset the old one.
 *
 * Lockless (TCQ_F_NOLOCK) qdiscs are additionally serialized via their
 * seqlock, taken before the per-qdisc tree lock.  The DEACTIVATED bit is
 * set under the lock so concurrent enqueuers observe it before the
 * default qdisc is published and the old qdisc's queues are purged.
 */
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		bool nolock = qdisc->flags & TCQ_F_NOLOCK;

		if (nolock)
			spin_lock_bh(&qdisc->seqlock);
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
		if (nolock)
			spin_unlock_bh(&qdisc->seqlock);
	}
}

1158
static bool some_qdisc_is_busy(struct net_device *dev)
1159 1160 1161 1162 1163
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
1164
		spinlock_t *root_lock;
1165
		struct Qdisc *q;
1166 1167 1168
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
1169
		q = dev_queue->qdisc_sleeping;
1170

1171 1172
		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);
1173

1174 1175
		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));
1176

1177
		spin_unlock_bh(root_lock);
1178 1179 1180 1181 1182 1183 1184

		if (val)
			return true;
	}
	return false;
}

1185 1186 1187 1188 1189 1190 1191 1192 1193 1194
/* Per-queue callback: reset the sleeping qdisc, if one is attached. */
static void dev_qdisc_reset(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *none)
{
	struct Qdisc *sleeping = dev_queue->qdisc_sleeping;

	if (!sleeping)
		return;

	qdisc_reset(sleeping);
}

1195 1196 1197 1198 1199 1200 1201
/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	/* Step 1: swap every queue's qdisc for &noop_qdisc and stop the
	 * tx watchdog on each device.
	 */
	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 * NOTE(review): the code below calls synchronize_net()
	 * unconditionally; the "avoided" wording appears stale — confirm
	 * against history.
	 */
	synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev)) {
			/* wait_event() would avoid this sleep-loop but would
			 * require expensive checks in the fast paths of packet
			 * processing which isn't worth it.
			 */
			schedule_timeout_uninterruptible(1);
		}
		/* The new qdisc is assigned at this point so we can safely
		 * unwind stale skb lists and qdisc statistics
		 */
		netdev_for_each_tx_queue(dev, dev_qdisc_reset, NULL);
		if (dev_ingress_queue(dev))
			dev_qdisc_reset(dev, dev_ingress_queue(dev), NULL);
	}
}

/* Deactivate a single device by wrapping it in a one-element list and
 * delegating to dev_deactivate_many().  The device's close_list linkage
 * is borrowed for the temporary list and unlinked before returning.
 */
void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
1249

1250 1251 1252 1253 1254 1255 1256 1257 1258 1259 1260 1261 1262 1263 1264 1265 1266 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282
/* Tell @dev_queue's sleeping qdisc about a changed dev->tx_queue_len.
 * Qdiscs without a ->change_tx_queue_len() hook need no update.
 */
static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	const struct Qdisc_ops *ops = qdisc->ops;

	if (!ops->change_tx_queue_len)
		return 0;

	return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
}

/* Propagate a changed dev->tx_queue_len to every tx queue's qdisc.
 *
 * The device is quiesced (deactivated/reactivated) around the update
 * when it is up.  Returns 0 or the first per-queue error; queues already
 * updated are not reverted on partial failure.
 */
int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool was_up = dev->flags & IFF_UP;
	unsigned int qidx;
	int err = 0;

	if (was_up)
		dev_deactivate(dev);

	for (qidx = 0; qidx < dev->num_tx_queues; qidx++) {
		err = qdisc_change_tx_queue_len(dev, &dev->_tx[qidx]);

		/* TODO: revert changes on a partial failure */
		if (err)
			break;
	}

	if (was_up)
		dev_activate(dev);

	return err;
}

1283 1284
/* Per-queue callback for dev_init_scheduler(): point both the active
 * and the sleeping qdisc of @dev_queue at @_qdisc (&noop_qdisc at init).
 */
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

L
Linus Torvalds 已提交
1293 1294
/* Initialize the packet-scheduler state of a fresh @dev: every queue
 * (including ingress, if present) starts with the builtin &noop_qdisc,
 * and the tx hang watchdog timer is set up (but not armed).
 */
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

1303 1304 1305
/* Per-queue callback for dev_shutdown(): detach the sleeping qdisc,
 * point both qdisc pointers at @_qdisc_default, then drop the reference
 * on the old qdisc (only after it is unreachable from the queue).
 */
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_put(qdisc);
	}
}

/* Tear down all qdisc state of @dev: replace every queue's qdisc with
 * &noop_qdisc, drop the root qdisc reference, and sanity-check that the
 * tx watchdog timer is no longer pending.
 */
void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_put(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}
1328

1329
/* Precompute reciprocal parameters for rate-to-time conversion.
 *
 * Fills @r from @conf, taking the larger of conf->rate and the 64-bit
 * @rate64 override, so the fast path can turn a length in bytes into a
 * transmission time in ns without a division:
 *
 *	time_in_ns = (len * r->mult) >> r->shift
 *
 * instead of (NSEC_PER_SEC * len) / rate_bytes_ps.  mult/shift are grown
 * for maximum accuracy while guaranteeing neither the 32-bit mult nor
 * the scaled numerator can overflow.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	u64 scaled_nsec = NSEC_PER_SEC;

	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->mult = 1;

	if (r->rate_bytes_ps == 0)
		return;

	/* Double the numerator (bumping shift) until the next step would
	 * overflow either mult (bit 31) or the numerator (bit 63).
	 */
	while (1) {
		r->mult = div64_u64(scaled_nsec, r->rate_bytes_ps);
		if (r->mult & (1U << 31) || scaled_nsec & (1ULL << 63))
			break;
		scaled_nsec <<= 1;
		r->shift++;
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1364 1365 1366 1367 1368 1369 1370 1371

/* Intentionally empty RCU callback: mini_qdisc_pair_swap() only needs
 * rcu_barrier() to be able to wait for this callback to run, which
 * blocks reuse of the inactive miniq until all readers are done.
 */
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

/* Swap the published mini_Qdisc of @miniqp to carry @tp_head.
 *
 * The pair ping-pongs between miniq1 and miniq2: the inactive buffer is
 * refilled and published with rcu_assign_pointer() while readers may
 * still hold the old one; rcu_barrier() guarantees the previous
 * call_rcu(mini_qdisc_rcu_func) has finished before a buffer is reused.
 * A NULL @tp_head unpublishes the pair entirely.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	/* Protected with chain0->filter_chain_lock.
	 * Can't access chain directly because tp_head can be NULL.
	 */
	struct mini_Qdisc *miniq_old =
		rcu_dereference_protected(*miniqp->p_miniq, 1);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		/* Wait for flying RCU callback before it is freed. */
		rcu_barrier();
		return;
	}

	/* Pick whichever half of the pair is currently inactive. */
	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu callback
	 * is done.
	 */
	rcu_barrier();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is counterpart of the rcu barriers above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

1406 1407 1408 1409 1410 1411 1412 1413
/* Record the shared tcf_block on both halves of the mini-qdisc pair. */
void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
				struct tcf_block *block)
{
	miniqp->miniq1.block = block;
	miniqp->miniq2.block = block;
}
EXPORT_SYMBOL(mini_qdisc_pair_block_init);

1414 1415 1416 1417 1418 1419 1420 1421 1422 1423
/* Initialize a mini-qdisc pair for @qdisc: both halves share the
 * qdisc's per-CPU stats, and @p_miniq is where mini_qdisc_pair_swap()
 * publishes the active half.
 */
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);