// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <linux/skb_array.h>
#include <linux/if_macvlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
#include <trace/events/net.h>
#include <net/xfrm.h>

/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

static void qdisc_maybe_clear_missed(struct Qdisc *q,
				     const struct netdev_queue *txq)
{
	clear_bit(__QDISC_STATE_MISSED, &q->state);

	/* Make sure the below netif_xmit_frozen_or_stopped()
	 * checking happens after clearing STATE_MISSED.
	 */
	smp_mb__after_atomic();

	/* Checking netif_xmit_frozen_or_stopped() again to
	 * make sure STATE_MISSED is set if the STATE_MISSED
	 * set by netif_tx_wake_queue()'s rescheduling of
	 * net_tx_action() is cleared by the above clear_bit().
	 */
	if (!netif_xmit_frozen_or_stopped(txq))
		set_bit(__QDISC_STATE_MISSED, &q->state);
	else
		set_bit(__QDISC_STATE_DRAINING, &q->state);
}

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

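/* Illustrative sketch (assumed call chain, not code from this file): the
 * lock order on the transmit path for a root-lock qdisc is roughly
 *
 *	__dev_queue_xmit()
 *	  rcu_read_lock_bh();
 *	  spin_lock(qdisc_lock(q));		// root lock
 *	    q->enqueue(skb, q, &to_free);
 *	    qdisc_run() -> qdisc_restart() -> sch_direct_xmit()
 *	      spin_unlock(qdisc_lock(q));	// dropped around the driver call
 *	      HARD_TX_LOCK(dev, txq, cpu);	// per-queue driver lock
 *	      dev_hard_start_xmit(skb, dev, txq, &ret);
 *	      HARD_TX_UNLOCK(dev, txq);
 *	      spin_lock(qdisc_lock(q));		// retaken to touch queue state
 *
 * TCQ_F_NOLOCK qdiscs (e.g. pfifo_fast below) replace the root lock with
 * q->seqlock plus lockless skb_array rings.
 */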
#define SKB_XOFF_MAGIC ((struct sk_buff *)1UL)

static inline struct sk_buff *__skb_dequeue_bad_txq(struct Qdisc *q)
{
	const struct netdev_queue *txq = q->dev_queue;
	spinlock_t *lock = NULL;
	struct sk_buff *skb;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	skb = skb_peek(&q->skb_bad_txq);
	if (skb) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->skb_bad_txq);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = SKB_XOFF_MAGIC;
			qdisc_maybe_clear_missed(q, txq);
		}
	}

	if (lock)
		spin_unlock(lock);

	return skb;
}

static inline struct sk_buff *qdisc_dequeue_skb_bad_txq(struct Qdisc *q)
{
	struct sk_buff *skb = skb_peek(&q->skb_bad_txq);

	if (unlikely(skb))
		skb = __skb_dequeue_bad_txq(q);

	return skb;
}

static inline void qdisc_enqueue_skb_bad_txq(struct Qdisc *q,
					     struct sk_buff *skb)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	__skb_queue_tail(&q->skb_bad_txq, skb);

	if (qdisc_is_percpu_stats(q)) {
		qdisc_qstats_cpu_backlog_inc(q, skb);
		qdisc_qstats_cpu_qlen_inc(q);
	} else {
		qdisc_qstats_backlog_inc(q, skb);
		q->q.qlen++;
	}

	if (lock)
		spin_unlock(lock);
}

static inline void dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	spinlock_t *lock = NULL;

	if (q->flags & TCQ_F_NOLOCK) {
		lock = qdisc_lock(q);
		spin_lock(lock);
	}

	while (skb) {
		struct sk_buff *next = skb->next;

		__skb_queue_tail(&q->gso_skb, skb);

		/* it's still part of the queue */
		if (qdisc_is_percpu_stats(q)) {
			qdisc_qstats_cpu_requeues_inc(q);
			qdisc_qstats_cpu_backlog_inc(q, skb);
			qdisc_qstats_cpu_qlen_inc(q);
		} else {
			q->qstats.requeues++;
			qdisc_qstats_backlog_inc(q, skb);
			q->q.qlen++;
		}

		skb = next;
	}

	if (lock) {
		spin_unlock(lock);
		set_bit(__QDISC_STATE_MISSED, &q->state);
	} else {
		__netif_schedule(q);
	}
}

static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb_mark_not_on_list(skb);
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			qdisc_enqueue_skb_bad_txq(q, nskb);
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);
	(*packets) += cnt;
	skb_mark_not_on_list(skb);
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	const struct netdev_queue *txq = q->dev_queue;
	struct sk_buff *skb = NULL;

	*packets = 1;
	if (unlikely(!skb_queue_empty(&q->gso_skb))) {
		spinlock_t *lock = NULL;

		if (q->flags & TCQ_F_NOLOCK) {
			lock = qdisc_lock(q);
			spin_lock(lock);
		}

		skb = skb_peek(&q->gso_skb);

		/* skb may be null if another cpu pulls gso_skb off in between
		 * empty check and lock.
		 */
		if (!skb) {
			if (lock)
				spin_unlock(lock);
			goto validate;
		}

		/* skbs in gso_skb were already validated */
		*validate = false;
		if (xfrm_offload(skb))
			*validate = true;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			skb = __skb_dequeue(&q->gso_skb);
			if (qdisc_is_percpu_stats(q)) {
				qdisc_qstats_cpu_backlog_dec(q, skb);
				qdisc_qstats_cpu_qlen_dec(q);
			} else {
				qdisc_qstats_backlog_dec(q, skb);
				q->q.qlen--;
			}
		} else {
			skb = NULL;
			qdisc_maybe_clear_missed(q, txq);
		}
		if (lock)
			spin_unlock(lock);
		goto trace;
	}
validate:
	*validate = true;

	if ((q->flags & TCQ_F_ONETXQUEUE) &&
	    netif_xmit_frozen_or_stopped(txq)) {
		qdisc_maybe_clear_missed(q, txq);
		return skb;
	}

	skb = qdisc_dequeue_skb_bad_txq(q);
	if (unlikely(skb)) {
		if (skb == SKB_XOFF_MAGIC)
			return NULL;
		goto bulk;
	}
	skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning the running seqcount guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen, back off
 *				true   - feel free to send more pkts
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;
	bool again = false;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev, &again);

#ifdef CONFIG_XFRM_OFFLOAD
	if (unlikely(again)) {
		if (root_lock)
			spin_lock(root_lock);

		dev_requeue_skb(skb, q);
		return false;
	}
#endif

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
		else
			qdisc_maybe_clear_missed(q, txq);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				false - queue is empty or throttled.
 *				true  - queue is not empty.
 *
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}
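/* A note on the quota used by __qdisc_run() below: dev_tx_weight bounds how
 * many packets one invocation may emit; it is normally derived from the
 * net.core.dev_weight sysctl (64 by default, scaled by
 * net.core.dev_weight_tx_bias). Once the budget is spent, the remaining work
 * is deferred: lockless qdiscs set __QDISC_STATE_MISSED, the others
 * reschedule net_tx_action() via __netif_schedule().
 */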

void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		quota -= packets;
		if (quota <= 0) {
			if (q->flags & TCQ_F_NOLOCK)
				set_bit(__QDISC_STATE_MISSED, &q->state);
			else
				__netif_schedule(q);

			break;
		}
	}
}

unsigned long dev_trans_start(struct net_device *dev)
{
	unsigned long val, res;
	unsigned int i;

	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
	else if (netif_is_macvlan(dev))
		dev = macvlan_dev_real_dev(dev);
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}

	return res;
}
EXPORT_SYMBOL(dev_trans_start);

static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				trace_net_dev_xmit_timeout(dev, i);
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev, i);
			}
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}
EXPORT_SYMBOL_GPL(__netdev_watchdog_up);

static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected acquisition of carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_up_count);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_down_count);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);

/**
 *	netif_carrier_event - report carrier state event
 *	@dev: network device
 *
 * Device has detected a carrier event but the carrier state wasn't changed.
 * Use in drivers when querying carrier state asynchronously, to avoid missing
 * events (link flaps) if link recovers before it's queried.
 */
void netif_carrier_event(struct net_device *dev)
{
	if (dev->reg_state == NETREG_UNINITIALIZED)
		return;
	atomic_inc(&dev->carrier_up_count);
	atomic_inc(&dev->carrier_down_count);
	linkwatch_fire_event(dev);
}
EXPORT_SYMBOL_GPL(netif_carrier_event);

/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	RCU_POINTER_INITIALIZER(qdisc, &noop_qdisc),
	.qdisc_sleeping	=	&noop_qdisc,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
	.gso_skb = {
		.next = (struct sk_buff *)&noop_qdisc.gso_skb,
		.prev = (struct sk_buff *)&noop_qdisc.gso_skb,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.gso_skb.lock),
	},
	.skb_bad_txq = {
		.next = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.prev = (struct sk_buff *)&noop_qdisc.skb_bad_txq,
		.qlen = 0,
		.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.skb_bad_txq.lock),
	},
};
EXPORT_SYMBOL(noop_qdisc);
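/* noop_qdisc is the built-in "drop everything" qdisc: noop_enqueue() frees
 * the skb and returns NET_XMIT_CN. It is what dev_queue->qdisc points at
 * while a device is inactive (see dev_init_scheduler() and
 * dev_deactivate_queue() below), so packets submitted in that window are
 * dropped instead of reaching a half-torn-down qdisc.
 */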

static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt,
			struct netlink_ext_ack *extack)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3
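/* Band selection in pfifo_fast_enqueue() is prio2band[skb->priority &
 * TC_PRIO_MAX], and bands are drained in order 0, 1, 2. For example,
 * TC_PRIO_INTERACTIVE (6) and TC_PRIO_CONTROL (7) map to band 0,
 * TC_PRIO_BESTEFFORT (0) to band 1, and TC_PRIO_BULK (2) to band 2.
 */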

/*
 * Private data for a pfifo_fast scheduler containing:
 *	- rings for priority bands
 */
struct pfifo_fast_priv {
	struct skb_array q[PFIFO_FAST_BANDS];
};

static inline struct skb_array *band2list(struct pfifo_fast_priv *priv,
					  int band)
{
	return &priv->q[band];
}

static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	int band = prio2band[skb->priority & TC_PRIO_MAX];
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct skb_array *q = band2list(priv, band);
	unsigned int pkt_len = qdisc_pkt_len(skb);
	int err;

	err = skb_array_produce(q, skb);

	if (unlikely(err)) {
		if (qdisc_is_percpu_stats(qdisc))
			return qdisc_drop_cpu(skb, qdisc, to_free);
		else
			return qdisc_drop(skb, qdisc, to_free);
	}

	qdisc_update_stats_at_enqueue(qdisc, pkt_len);
	return NET_XMIT_SUCCESS;
}

static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	bool need_retry = true;
	int band;

retry:
	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		if (__skb_array_empty(q))
			continue;

		skb = __skb_array_consume(q);
	}
	if (likely(skb)) {
		qdisc_update_stats_at_dequeue(qdisc, skb);
	} else if (need_retry &&
		   READ_ONCE(qdisc->state) & QDISC_STATE_NON_EMPTY) {
		/* Delay clearing the STATE_MISSED here to reduce
		 * the overhead of the second spin_trylock() in
		 * qdisc_run_begin() and the __netif_schedule() call
		 * in qdisc_run_end().
		 */
		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);

		/* Make sure dequeuing happens after clearing
		 * STATE_MISSED.
		 */
		smp_mb__after_atomic();

		need_retry = false;

		goto retry;
	}

	return skb;
}

static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	struct sk_buff *skb = NULL;
	int band;

	for (band = 0; band < PFIFO_FAST_BANDS && !skb; band++) {
		struct skb_array *q = band2list(priv, band);

		skb = __skb_array_peek(q);
	}

	return skb;
}

static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int i, band;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (band = 0; band < PFIFO_FAST_BANDS; band++) {
		struct skb_array *q = band2list(priv, band);
		struct sk_buff *skb;

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;

		while ((skb = __skb_array_consume(q)) != NULL)
			kfree_skb(skb);
	}

	if (qdisc_is_percpu_stats(qdisc)) {
		for_each_possible_cpu(i) {
			struct gnet_stats_queue *q;

			q = per_cpu_ptr(qdisc->cpu_qstats, i);
			q->backlog = 0;
			q->qlen = 0;
		}
	}
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt,
			   struct netlink_ext_ack *extack)
{
	unsigned int qlen = qdisc_dev(qdisc)->tx_queue_len;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int prio;

	/* guard against zero length rings */
	if (!qlen)
		return -EINVAL;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);
		int err;

		err = skb_array_init(q, qlen, GFP_KERNEL);
		if (err)
			return -ENOMEM;
	}

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

static void pfifo_fast_destroy(struct Qdisc *sch)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		/* NULL ring is possible if destroy path is due to a failed
		 * skb_array_init() in pfifo_fast_init() case.
		 */
		if (!q->ring.queue)
			continue;
		/* Destroy ring but no need to kfree_skb because a call to
		 * pfifo_fast_reset() has already done that work.
		 */
		ptr_ring_cleanup(&q->ring, NULL);
	}
}

static int pfifo_fast_change_tx_queue_len(struct Qdisc *sch,
					  unsigned int new_len)
{
	struct pfifo_fast_priv *priv = qdisc_priv(sch);
	struct skb_array *bands[PFIFO_FAST_BANDS];
	int prio;

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		struct skb_array *q = band2list(priv, prio);

		bands[prio] = q;
	}

	return skb_array_resize_multiple(bands, PFIFO_FAST_BANDS, new_len,
					 GFP_KERNEL);
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.destroy	=	pfifo_fast_destroy,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.change_tx_queue_len =  pfifo_fast_change_tx_queue_len,
	.owner		=	THIS_MODULE,
	.static_flags	=	TCQ_F_NOLOCK | TCQ_F_CPUSTATS,
};
EXPORT_SYMBOL(pfifo_fast_ops);
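/* pfifo_fast is what default_qdisc_ops (top of this file) points at, so it
 * is the root qdisc attached by attach_one_default_qdisc() below unless the
 * administrator overrides the default, e.g. via the net.core.default_qdisc
 * sysctl or an explicit "tc qdisc replace dev <dev> root ...". The
 * static_flags above make every instance lockless (TCQ_F_NOLOCK) with
 * per-CPU stats (TCQ_F_CPUSTATS).
 */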

static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops,
			  struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;
	unsigned int size = sizeof(*sch) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		NL_SET_ERR_MSG(extack, "No device queue given");
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	sch = kzalloc_node(size, GFP_KERNEL, netdev_queue_numa_node_read(dev_queue));

	if (!sch)
		goto errout;
	__skb_queue_head_init(&sch->gso_skb);
	__skb_queue_head_init(&sch->skb_bad_txq);
	qdisc_skb_head_init(&sch->q);
	gnet_stats_basic_sync_init(&sch->bstats);
	spin_lock_init(&sch->q.lock);

	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_sync);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	/* seqlock has the same scope as busylock, for NOLOCK qdisc */
	spin_lock_init(&sch->seqlock);
	lockdep_set_class(&sch->seqlock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(sch);
errout:
	return ERR_PTR(err);
}

struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid,
				struct netlink_ext_ack *extack)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner)) {
		NL_SET_ERR_MSG(extack, "Failed to increase module reference counter");
		return NULL;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL, extack) == 0) {
		trace_qdisc_create(ops, dev_queue->dev, parentid);
		return sch;
	}

	qdisc_put(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
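/* Typical use, as in attach_one_default_qdisc() further down: create a
 * default root qdisc for one tx queue and bail out gracefully on failure.
 *
 *	qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, TC_H_ROOT, NULL);
 *	if (!qdisc)
 *		return;		// caller keeps the existing (noop) qdisc
 */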

/* Under qdisc_lock(qdisc) and BH! */

void qdisc_reset(struct Qdisc *qdisc)
{
	const struct Qdisc_ops *ops = qdisc->ops;
	struct sk_buff *skb, *tmp;

	trace_qdisc_reset(qdisc);

	if (ops->reset)
		ops->reset(qdisc);

	skb_queue_walk_safe(&qdisc->gso_skb, skb, tmp) {
		__skb_unlink(skb, &qdisc->gso_skb);
		kfree_skb_list(skb);
	}

	skb_queue_walk_safe(&qdisc->skb_bad_txq, skb, tmp) {
		__skb_unlink(skb, &qdisc->skb_bad_txq);
		kfree_skb_list(skb);
	}

	qdisc->q.qlen = 0;
	qdisc->qstats.backlog = 0;
}
EXPORT_SYMBOL(qdisc_reset);

void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree(qdisc);
}

static void qdisc_free_cb(struct rcu_head *head)
{
	struct Qdisc *q = container_of(head, struct Qdisc, rcu);

	qdisc_free(q);
}

static void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);

	qdisc_reset(qdisc);

	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	trace_qdisc_destroy(qdisc);

	call_rcu(&qdisc->rcu, qdisc_free_cb);
}

void qdisc_put(struct Qdisc *qdisc)
{
	if (!qdisc)
		return;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
}
EXPORT_SYMBOL(qdisc_put);

/* Version of qdisc_put() that is called with rtnl mutex unlocked.
 * Intended to be used as an optimization, this function only takes the
 * rtnl lock if the qdisc reference counter reaches zero.
 */

void qdisc_put_unlocked(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_rtnl_lock(&qdisc->refcnt))
		return;

	qdisc_destroy(qdisc);
	rtnl_unlock();
}
EXPORT_SYMBOL(qdisc_put_unlocked);

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);
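/* Note that dev_graft_qdisc() only swaps the dormant (qdisc_sleeping) qdisc
 * and parks the active pointer on noop_qdisc; the caller (e.g. qdisc_graft()
 * under RTNL) is expected to deactivate/reactivate the device around it and
 * to qdisc_put() the returned old qdisc once it is no longer needed.
 */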

static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;
	else if (dev->type == ARPHRD_CAN)
		ops = &pfifo_fast_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT, NULL);
	if (!qdisc)
		return;

	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT, NULL);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}

	/* Detect a failed default qdisc setup/init and fall back to "noqueue" */
	if (dev->qdisc == &noop_qdisc) {
		netdev_warn(dev, "default qdisc (%s) fail, fallback to %s\n",
			    default_qdisc_ops->id, noqueue_qdisc_ops.id);
		dev->priv_flags |= IFF_NO_QUEUE;
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
		dev->priv_flags ^= IFF_NO_QUEUE;
	}

#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to the device;
	 * create a default one for devices which need queueing,
	 * and noqueue_qdisc for virtual interfaces.
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);

static void qdisc_deactivate(struct Qdisc *qdisc)
{
	if (qdisc->flags & TCQ_F_BUILTIN)
		return;

	set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);
}

static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		qdisc_deactivate(qdisc);
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
	}
}

static void dev_reset_queue(struct net_device *dev,
			    struct netdev_queue *dev_queue,
			    void *_unused)
{
	struct Qdisc *qdisc;
	bool nolock;

	qdisc = dev_queue->qdisc_sleeping;
	if (!qdisc)
		return;

	nolock = qdisc->flags & TCQ_F_NOLOCK;

	if (nolock)
		spin_lock_bh(&qdisc->seqlock);
	spin_lock_bh(qdisc_lock(qdisc));

	qdisc_reset(qdisc);

	spin_unlock_bh(qdisc_lock(qdisc));
	if (nolock) {
		clear_bit(__QDISC_STATE_MISSED, &qdisc->state);
		clear_bit(__QDISC_STATE_DRAINING, &qdisc->state);
		spin_unlock_bh(&qdisc->seqlock);
	}
}

static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		root_lock = qdisc_lock(q);
		spin_lock_bh(root_lock);

		val = (qdisc_is_running(q) ||
		       test_bit(__QDISC_STATE_SCHED, &q->state));

		spin_unlock_bh(root_lock);

		if (val)
			return true;
	}
	return false;
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls or
	 * outstanding qdisc enqueuing calls.
	 * This is avoided if all devices are in dismantle phase:
	 * Caller will call synchronize_net() for us
	 */
	synchronize_net();

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_reset_queue, NULL);

		if (dev_ingress_queue(dev))
			dev_reset_queue(dev, dev_ingress_queue(dev), NULL);
	}

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list) {
		while (some_qdisc_is_busy(dev)) {
			/* wait_event() would avoid this sleep-loop but would
			 * require expensive checks in the fast paths of packet
			 * processing which isn't worth it.
			 */
			schedule_timeout_uninterruptible(1);
		}
	}
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);

static int qdisc_change_tx_queue_len(struct net_device *dev,
				     struct netdev_queue *dev_queue)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	const struct Qdisc_ops *ops = qdisc->ops;

	if (ops->change_tx_queue_len)
		return ops->change_tx_queue_len(qdisc, dev->tx_queue_len);
	return 0;
}

void dev_qdisc_change_real_num_tx(struct net_device *dev,
				  unsigned int new_real_tx)
{
	struct Qdisc *qdisc = dev->qdisc;

	if (qdisc->ops->change_real_num_tx)
		qdisc->ops->change_real_num_tx(qdisc, new_real_tx);
}

void mq_change_real_num_tx(struct Qdisc *sch, unsigned int new_real_tx)
{
#ifdef CONFIG_NET_SCHED
	struct net_device *dev = qdisc_dev(sch);
	struct Qdisc *qdisc;
	unsigned int i;

	for (i = new_real_tx; i < dev->real_num_tx_queues; i++) {
		qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
		/* Only update the default qdiscs we created,
		 * qdiscs with handles are always hashed.
		 */
		if (qdisc != &noop_qdisc && !qdisc->handle)
			qdisc_hash_del(qdisc);
	}
	for (i = dev->real_num_tx_queues; i < new_real_tx; i++) {
		qdisc = netdev_get_tx_queue(dev, i)->qdisc_sleeping;
		if (qdisc != &noop_qdisc && !qdisc->handle)
			qdisc_hash_add(qdisc, false);
	}
#endif
}
EXPORT_SYMBOL(mq_change_real_num_tx);

int dev_qdisc_change_tx_queue_len(struct net_device *dev)
{
	bool up = dev->flags & IFF_UP;
	unsigned int i;
	int ret = 0;

	if (up)
		dev_deactivate(dev);

	for (i = 0; i < dev->num_tx_queues; i++) {
		ret = qdisc_change_tx_queue_len(dev, &dev->_tx[i]);

		/* TODO: revert changes on a partial failure */
		if (ret)
			break;
	}

	if (up)
		dev_activate(dev);
	return ret;
}

static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_put(qdisc);
	}
}

void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_put(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}

/**
 * psched_ratecfg_precompute__() - Pre-compute values for reciprocal division
 * @rate:   Rate to compute reciprocal division values of
 * @mult:   Multiplier for reciprocal division
 * @shift:  Shift for reciprocal division
 *
 * The multiplier and shift for reciprocal division by rate are stored
 * in mult and shift.
 *
 * The deal here is to replace a divide by a reciprocal one
 * in fast path (a reciprocal divide is a multiply and a shift)
 *
 * Normal formula would be :
 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
 *
 * We compute mult/shift to use instead :
 *  time_in_ns = (len * mult) >> shift;
 *
 * We try to get the highest possible mult value for accuracy,
 * but have to make sure no overflows will ever happen.
 *
 * reciprocal_value() is not used here because it doesn't handle 64-bit values.
 */
static void psched_ratecfg_precompute__(u64 rate, u32 *mult, u8 *shift)
{
	u64 factor = NSEC_PER_SEC;

	*mult = 1;
	*shift = 0;

	if (rate <= 0)
		return;

	for (;;) {
		*mult = div64_u64(factor, rate);
		if (*mult & (1U << 31) || factor & (1ULL << 63))
			break;
		factor <<= 1;
		(*shift)++;
	}
}
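/* Worked example (illustrative): for rate = 125000 bytes/s (1 Mbit/s),
 * NSEC_PER_SEC / rate = 8000 ns per byte. The loop above settles on
 * shift = 19 and mult = 4194304000, so the fast path cost of a 1500 byte
 * packet is (1500 * 4194304000ULL) >> 19 = 12000000 ns = 12 ms, which
 * matches 1500 * 8 bits at 1 Mbit/s.
 */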

void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	psched_ratecfg_precompute__(r->rate_bytes_ps, &r->mult, &r->shift);
}
EXPORT_SYMBOL(psched_ratecfg_precompute);

void psched_ppscfg_precompute(struct psched_pktrate *r, u64 pktrate64)
{
	r->rate_pkts_ps = pktrate64;
	psched_ratecfg_precompute__(r->rate_pkts_ps, &r->mult, &r->shift);
}
EXPORT_SYMBOL(psched_ppscfg_precompute);

static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	/* Protected with chain0->filter_chain_lock.
	 * Can't access chain directly because tp_head can be NULL.
	 */
	struct mini_Qdisc *miniq_old =
		rcu_dereference_protected(*miniqp->p_miniq, 1);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		/* Wait for flying RCU callback before it is freed. */
		rcu_barrier();
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu callback
	 * is done.
	 */
	rcu_barrier();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is the counterpart of the rcu barriers above. We need
		 * to block potential new users of miniq_old until all readers
		 * are done with it.
		 */
		call_rcu(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

void mini_qdisc_pair_block_init(struct mini_Qdisc_pair *miniqp,
				struct tcf_block *block)
{
	miniqp->miniq1.block = block;
	miniqp->miniq2.block = block;
}
EXPORT_SYMBOL(mini_qdisc_pair_block_init);

void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);