sch_generic.c 25.6 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/if_vlan.h>
#include <net/sch_generic.h>
#include <net/pkt_sched.h>
#include <net/dst.h>
#include <trace/events/qdisc.h>
L
Linus Torvalds 已提交
33

34 35 36 37
/* Qdisc to use by default */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;	/* 3-band FIFO, defined below */
EXPORT_SYMBOL(default_qdisc_ops);

L
Linus Torvalds 已提交
38 39
/* Main transmission queue. */

40
/* Modifications to data participating in scheduling must be protected with
41
 * qdisc_lock(qdisc) spinlock.
42 43
 *
 * The idea is the following:
44 45
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
46
 * - updates to tree and tree walking are only done under the rtnl mutex.
L
Linus Torvalds 已提交
47 48
 */

49
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
50
{
51
	q->gso_skb = skb;
52
	q->qstats.requeues++;
53
	qdisc_qstats_backlog_inc(q, skb);
54
	q->q.qlen++;	/* it's still part of the queue */
55
	__netif_schedule(q);
56

57 58 59
	return 0;
}

60 61
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
62 63
				 const struct netdev_queue *txq,
				 int *packets)
64
{
65
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;
66 67

	while (bytelimit > 0) {
68
		struct sk_buff *nskb = q->dequeue(q);
69

70
		if (!nskb)
71 72
			break;

73 74 75
		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
76
		(*packets)++; /* GSO counts as one pkt */
77
	}
78
	skb->next = NULL;
79 80
}

81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108
/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		/* Packet for a different txq: stash it in skb_bad_txq
		 * (re-accounting it into the qdisc) and stop bulking.
		 */
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			q->skb_bad_txq = nskb;
			qdisc_qstats_backlog_inc(q, nskb);
			q->q.qlen++;
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);	/* bulk at most 8 extra packets */
	(*packets) += cnt;
	skb->next = NULL;
}

109 110 111
/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 */
112 113
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
114
{
115
	struct sk_buff *skb = q->gso_skb;
116
	const struct netdev_queue *txq = q->dev_queue;
117

118
	*packets = 1;
119
	if (unlikely(skb)) {
120 121
		/* skb in gso_skb were already validated */
		*validate = false;
122
		/* check the reason of requeuing without tx lock first */
123
		txq = skb_get_tx_queue(txq->dev, skb);
124
		if (!netif_xmit_frozen_or_stopped(txq)) {
125
			q->gso_skb = NULL;
126
			qdisc_qstats_backlog_dec(q, skb);
127 128
			q->q.qlen--;
		} else
129
			skb = NULL;
130
		goto trace;
131 132 133 134 135 136 137 138 139 140 141
	}
	*validate = true;
	skb = q->skb_bad_txq;
	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->skb_bad_txq = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
			goto bulk;
142
		}
143 144
		skb = NULL;
		goto trace;
145 146 147 148 149 150 151 152 153 154
	}
	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
	    !netif_xmit_frozen_or_stopped(txq))
		skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
155
	}
156 157
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
158 159 160
	return skb;
}

161
/*
162
 * Transmit possibly several skbs, and handle the return status as
163
 * required. Owning running seqcount bit guarantees that
164
 * only one CPU can execute this function.
165 166
 *
 * Returns to the caller:
167 168
 *				false  - hardware queue frozen backoff
 *				true   - feel free to send more pkts
169
 */
170 171 172
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
L
Linus Torvalds 已提交
173
{
174
	int ret = NETDEV_TX_BUSY;
175 176

	/* And release qdisc */
177 178
	if (root_lock)
		spin_unlock(root_lock);
179

180 181 182
	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev);
183

184
	if (likely(skb)) {
185 186 187
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);
188

189
		HARD_TX_UNLOCK(dev, txq);
190
	} else {
191 192
		if (root_lock)
			spin_lock(root_lock);
193
		return true;
194
	}
195 196 197

	if (root_lock)
		spin_lock(root_lock);
198

199
	if (!dev_xmit_complete(ret)) {
200
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
201 202 203
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);
204

205 206
		dev_requeue_skb(skb, q);
		return false;
207
	}
208

209
	if (ret && netif_xmit_frozen_or_stopped(txq))
210
		return false;
211

212
	return true;
L
Linus Torvalds 已提交
213 214
}

215 216 217
/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
218
 * running seqcount guarantees only one CPU can process
219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
234
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
235
{
236
	spinlock_t *root_lock = NULL;
237 238 239
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
240
	bool validate;
241 242

	/* Dequeue packet */
243
	skb = dequeue_skb(q, &validate, packets);
244
	if (unlikely(!skb))
245
		return false;
246

247 248 249
	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

250
	dev = qdisc_dev(q);
251
	txq = skb_get_tx_queue(dev, skb);
252

253
	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
254 255
}

256
void __qdisc_run(struct Qdisc *q)
H
Herbert Xu 已提交
257
{
258
	int quota = dev_tx_weight;
259
	int packets;
260

261
	while (qdisc_restart(q, &packets)) {
262
		/*
J
jamal 已提交
263 264 265
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
266
		 */
267 268
		quota -= packets;
		if (quota <= 0 || need_resched()) {
269
			__netif_schedule(q);
270
			break;
271 272
		}
	}
H
Herbert Xu 已提交
273 274
}

275 276
unsigned long dev_trans_start(struct net_device *dev)
{
277
	unsigned long val, res;
278 279
	unsigned int i;

280 281
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
F
Florian Westphal 已提交
282 283
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
284 285 286 287
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
288

289 290 291 292
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

293
static void dev_watchdog(struct timer_list *t)
L
Linus Torvalds 已提交
294
{
295
	struct net_device *dev = from_timer(dev, t, watchdog_timer);
L
Linus Torvalds 已提交
296

H
Herbert Xu 已提交
297
	netif_tx_lock(dev);
298
	if (!qdisc_tx_is_noop(dev)) {
L
Linus Torvalds 已提交
299 300 301
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
302
			int some_queue_timedout = 0;
303
			unsigned int i;
304
			unsigned long trans_start;
305 306 307 308 309

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
F
Florian Westphal 已提交
310
				trans_start = txq->trans_start;
311
				if (netif_xmit_stopped(txq) &&
312 313 314
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
315
					txq->trans_timeout++;
316 317 318
					break;
				}
			}
319

320 321
			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
322
				       dev->name, netdev_drivername(dev), i);
323
				dev->netdev_ops->ndo_tx_timeout(dev);
L
Linus Torvalds 已提交
324
			}
325 326 327
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
L
Linus Torvalds 已提交
328 329 330
				dev_hold(dev);
		}
	}
H
Herbert Xu 已提交
331
	netif_tx_unlock(dev);
L
Linus Torvalds 已提交
332 333 334 335 336 337

	dev_put(dev);
}

void __netdev_watchdog_up(struct net_device *dev)
{
338
	if (dev->netdev_ops->ndo_tx_timeout) {
L
Linus Torvalds 已提交
339 340
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
341 342
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
L
Linus Torvalds 已提交
343 344 345 346 347 348 349 350 351 352 353
			dev_hold(dev);
	}
}

/* Start the TX watchdog for @dev (thin wrapper over __netdev_watchdog_up). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
H
Herbert Xu 已提交
354
	netif_tx_lock_bh(dev);
L
Linus Torvalds 已提交
355
	if (del_timer(&dev->watchdog_timer))
356
		dev_put(dev);
H
Herbert Xu 已提交
357
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
358 359
}

360 361 362 363 364 365
/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
366 367
void netif_carrier_on(struct net_device *dev)
{
J
Jeff Garzik 已提交
368
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
369 370
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
371
		atomic_inc(&dev->carrier_changes);
372
		linkwatch_fire_event(dev);
J
Jeff Garzik 已提交
373 374 375
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
376
}
377
EXPORT_SYMBOL(netif_carrier_on);
378

379 380 381 382 383 384
/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
385 386
void netif_carrier_off(struct net_device *dev)
{
387 388 389
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
390
		atomic_inc(&dev->carrier_changes);
391
		linkwatch_fire_event(dev);
392
	}
393
}
394
EXPORT_SYMBOL(netif_carrier_off);
395

L
Linus Torvalds 已提交
396 397 398 399 400
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

401 402
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
L
Linus Torvalds 已提交
403
{
404
	__qdisc_drop(skb, to_free);
L
Linus Torvalds 已提交
405 406 407
	return NET_XMIT_CN;
}

408
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
409 410 411 412
{
	return NULL;
}

413
struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

422 423
static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
424
	.qdisc_sleeping	=	&noop_qdisc,
425 426
};

L
Linus Torvalds 已提交
427 428 429 430
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
431
	.ops		=	&noop_qdisc_ops,
432
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
433
	.dev_queue	=	&noop_netdev_queue,
434
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
435
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
L
Linus Torvalds 已提交
436
};
437
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
438

P
Phil Sutter 已提交
439 440 441 442 443 444 445 446 447 448
/* init op for the "noqueue" qdisc. */
static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

E
Eric Dumazet 已提交
458 459 460
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};
461 462 463 464 465 466 467

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

468 469 470 471 472 473 474
/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three band
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
475
	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
476 477 478 479 480 481 482 483 484 485
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

486
static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
487
					     int band)
488
{
489
	return priv->q + band;
490 491
}

492 493
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
494
{
495
	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
496 497
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
498
		struct qdisc_skb_head *list = band2list(priv, band);
L
Linus Torvalds 已提交
499

500
		priv->bitmap |= (1 << band);
501
		qdisc->q.qlen++;
502
		return __qdisc_enqueue_tail(skb, qdisc, list);
503
	}
504

505
	return qdisc_drop(skb, qdisc, to_free);
L
Linus Torvalds 已提交
506 507
}

E
Eric Dumazet 已提交
508
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
509
{
510 511
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];
L
Linus Torvalds 已提交
512

513
	if (likely(band >= 0)) {
514 515
		struct qdisc_skb_head *qh = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qh);
516 517 518 519 520

		if (likely(skb != NULL)) {
			qdisc_qstats_backlog_dec(qdisc, skb);
			qdisc_bstats_update(qdisc, skb);
		}
521 522

		qdisc->q.qlen--;
523
		if (qh->qlen == 0)
524 525 526
			priv->bitmap &= ~(1 << band);

		return skb;
527
	}
528

L
Linus Torvalds 已提交
529 530 531
	return NULL;
}

E
Eric Dumazet 已提交
532
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
533
{
534 535 536 537
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
538
		struct qdisc_skb_head *qh = band2list(priv, band);
539

540
		return qh->head;
541 542 543 544 545
	}

	return NULL;
}

E
Eric Dumazet 已提交
546
static void pfifo_fast_reset(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
547
{
548
	int prio;
549
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
550 551

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
552
		__qdisc_reset_queue(band2list(priv, prio));
553

554
	priv->bitmap = 0;
555
	qdisc->qstats.backlog = 0;
556
	qdisc->q.qlen = 0;
L
Linus Torvalds 已提交
557 558
}

559 560 561 562
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

E
Eric Dumazet 已提交
563
	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
564 565
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
566 567 568 569 570 571 572 573 574
	return skb->len;

nla_put_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
575
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
576 577

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
578
		qdisc_skb_head_init(band2list(priv, prio));
579

580 581
	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
582 583 584
	return 0;
}

585
struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_fast_ops);
L
Linus Torvalds 已提交
597

598
static struct lock_class_key qdisc_tx_busylock;
599
static struct lock_class_key qdisc_running_key;
600

601
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
602
			  const struct Qdisc_ops *ops)
L
Linus Torvalds 已提交
603 604 605
{
	void *p;
	struct Qdisc *sch;
E
Eric Dumazet 已提交
606
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
607
	int err = -ENOBUFS;
608 609 610 611 612 613
	struct net_device *dev;

	if (!dev_queue) {
		err = -EINVAL;
		goto errout;
	}
L
Linus Torvalds 已提交
614

615
	dev = dev_queue->dev;
616 617 618
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

L
Linus Torvalds 已提交
619
	if (!p)
620 621
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
E
Eric Dumazet 已提交
622 623 624 625 626 627 628 629 630 631
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
632 633
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);
634

635
	spin_lock_init(&sch->busylock);
636 637 638
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

639 640 641 642
	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

L
Linus Torvalds 已提交
643 644 645
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
646
	sch->dev_queue = dev_queue;
647
	dev_hold(dev);
648
	refcount_set(&sch->refcnt, 1);
649 650 651

	return sch;
errout:
652
	return ERR_PTR(err);
653 654
}

655
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
656 657
				const struct Qdisc_ops *ops,
				unsigned int parentid)
658 659
{
	struct Qdisc *sch;
660

661
	if (!try_module_get(ops->owner))
662
		return NULL;
663

664
	sch = qdisc_alloc(dev_queue, ops);
665 666 667 668
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
669
	sch->parent = parentid;
670

L
Linus Torvalds 已提交
671 672 673
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

674
	qdisc_destroy(sch);
L
Linus Torvalds 已提交
675 676
	return NULL;
}
677
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
678

679
/* Under qdisc_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
680 681 682

void qdisc_reset(struct Qdisc *qdisc)
{
683
	const struct Qdisc_ops *ops = qdisc->ops;
L
Linus Torvalds 已提交
684 685 686

	if (ops->reset)
		ops->reset(qdisc);
687

688 689 690
	kfree_skb(qdisc->skb_bad_txq);
	qdisc->skb_bad_txq = NULL;

691
	if (qdisc->gso_skb) {
692
		kfree_skb_list(qdisc->gso_skb);
693 694
		qdisc->gso_skb = NULL;
	}
695
	qdisc->q.qlen = 0;
696
	qdisc->qstats.backlog = 0;
L
Linus Torvalds 已提交
697
}
698
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
699

700
static void qdisc_free(struct Qdisc *qdisc)
E
Eric Dumazet 已提交
701
{
702
	if (qdisc_is_percpu_stats(qdisc)) {
703
		free_percpu(qdisc->cpu_bstats);
704 705
		free_percpu(qdisc->cpu_qstats);
	}
706

E
Eric Dumazet 已提交
707 708 709
	kfree((char *) qdisc - qdisc->padded);
}

710
void qdisc_destroy(struct Qdisc *qdisc)
L
Linus Torvalds 已提交
711
{
712 713
	const struct Qdisc_ops  *ops = qdisc->ops;

714
	if (qdisc->flags & TCQ_F_BUILTIN ||
715
	    !refcount_dec_and_test(&qdisc->refcnt))
716 717
		return;

718
#ifdef CONFIG_NET_SCHED
719
	qdisc_hash_del(qdisc);
720

E
Eric Dumazet 已提交
721
	qdisc_put_stab(rtnl_dereference(qdisc->stab));
722
#endif
723
	gen_kill_estimator(&qdisc->rate_est);
724 725 726 727 728 729 730 731
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

732
	kfree_skb_list(qdisc->gso_skb);
733
	kfree_skb(qdisc->skb_bad_txq);
734
	qdisc_free(qdisc);
L
Linus Torvalds 已提交
735
}
736
EXPORT_SYMBOL(qdisc_destroy);
L
Linus Torvalds 已提交
737

738 739 740 741 742 743 744 745 746 747 748
/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
749
	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
750 751 752 753 754 755 756 757 758 759 760 761
		qdisc_reset(oqdisc);

	/* ... and graft new one */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
762
EXPORT_SYMBOL(dev_graft_qdisc);
763

764 765 766 767
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
768 769
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;
770

771 772 773 774 775 776 777
	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
778
	}
779
	if (!netif_is_multiqueue(dev))
780
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
781 782 783
	dev_queue->qdisc_sleeping = qdisc;
}

784 785 786 787 788 789 790
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

791 792
	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
793 794
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
795
		qdisc_refcount_inc(dev->qdisc);
796
	} else {
797
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
798 799
		if (qdisc) {
			dev->qdisc = qdisc;
800
			qdisc->ops->attach(qdisc);
801 802
		}
	}
803
#ifdef CONFIG_NET_SCHED
804
	if (dev->qdisc != &noop_qdisc)
805
		qdisc_hash_add(dev->qdisc, false);
806
#endif
807 808
}

809 810 811 812
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
813
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
814 815
	int *need_watchdog_p = _need_watchdog;

816 817 818
	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

819
	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
820
	if (need_watchdog_p) {
821
		dev_queue->trans_start = 0;
822
		*need_watchdog_p = 1;
823
	}
824 825
}

L
Linus Torvalds 已提交
826 827
void dev_activate(struct net_device *dev)
{
828
	int need_watchdog;
829

L
Linus Torvalds 已提交
830
	/* No queueing discipline is attached to device;
831 832
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
L
Linus Torvalds 已提交
833 834
	 */

835 836
	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);
837

838 839 840 841
	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

842 843
	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
844 845
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);
846 847

	if (need_watchdog) {
848
		netif_trans_update(dev);
L
Linus Torvalds 已提交
849 850
		dev_watchdog_up(dev);
	}
851
}
852
EXPORT_SYMBOL(dev_activate);
853

854 855 856
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
857
{
858
	struct Qdisc *qdisc_default = _qdisc_default;
859 860
	struct Qdisc *qdisc;

861
	qdisc = rtnl_dereference(dev_queue->qdisc);
862
	if (qdisc) {
863 864
		spin_lock_bh(qdisc_lock(qdisc));

865 866 867
		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

868
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
869
		qdisc_reset(qdisc);
870

871
		spin_unlock_bh(qdisc_lock(qdisc));
872
	}
L
Linus Torvalds 已提交
873 874
}

875
static bool some_qdisc_is_busy(struct net_device *dev)
876 877 878 879 880
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
881
		spinlock_t *root_lock;
882
		struct Qdisc *q;
883 884 885
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
886
		q = dev_queue->qdisc_sleeping;
887

888 889 890 891 892
		if (q->flags & TCQ_F_NOLOCK) {
			val = test_bit(__QDISC_STATE_SCHED, &q->state);
		} else {
			root_lock = qdisc_lock(q);
			spin_lock_bh(root_lock);
893

894 895
			val = (qdisc_is_running(q) ||
			       test_bit(__QDISC_STATE_SCHED, &q->state));
896

897 898
			spin_unlock_bh(root_lock);
		}
899 900 901 902 903 904 905

		if (val)
			return true;
	}
	return false;
}

906 907 908 909 910 911 912
/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
913
void dev_deactivate_many(struct list_head *head)
L
Linus Torvalds 已提交
914
{
915
	struct net_device *dev;
916
	bool sync_needed = false;
917

918
	list_for_each_entry(dev, head, close_list) {
919 920 921 922 923 924 925
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
926
		sync_needed |= !dev->dismantle;
927
	}
L
Linus Torvalds 已提交
928

929 930 931 932 933 934
	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();
L
Linus Torvalds 已提交
935

936
	/* Wait for outstanding qdisc_run calls. */
937
	list_for_each_entry(dev, head, close_list)
938 939 940 941 942 943 944 945
		while (some_qdisc_is_busy(dev))
			yield();
}

void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

946
	list_add(&dev->close_list, &single);
947
	dev_deactivate_many(&single);
948
	list_del(&single);
L
Linus Torvalds 已提交
949
}
950
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
951

952 953
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
954
				     void *_qdisc)
955
{
956 957
	struct Qdisc *qdisc = _qdisc;

958
	rcu_assign_pointer(dev_queue->qdisc, qdisc);
959 960 961
	dev_queue->qdisc_sleeping = qdisc;
}

L
Linus Torvalds 已提交
962 963
void dev_init_scheduler(struct net_device *dev)
{
964
	dev->qdisc = &noop_qdisc;
965
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
966 967
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
L
Linus Torvalds 已提交
968

969
	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
L
Linus Torvalds 已提交
970 971
}

972 973 974
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
L
Linus Torvalds 已提交
975
{
976
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
977
	struct Qdisc *qdisc_default = _qdisc_default;
978 979

	if (qdisc) {
980
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
981
		dev_queue->qdisc_sleeping = qdisc_default;
L
Linus Torvalds 已提交
982 983

		qdisc_destroy(qdisc);
984
	}
985 986 987 988
}

void dev_shutdown(struct net_device *dev)
{
989
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
990 991
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
992 993 994
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

995
	WARN_ON(timer_pending(&dev->watchdog_timer));
L
Linus Torvalds 已提交
996
}
997

998
/* Precompute the mult/shift pair for @r from @conf (and 64-bit @rate64)
 * so the fast path can convert length to nanoseconds without a divide.
 */
void psched_ratecfg_precompute(struct psched_ratecfg *r,
			       const struct tc_ratespec *conf,
			       u64 rate64)
{
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
	r->mult = 1;
	/*
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
	 */
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
				break;
			factor <<= 1;
			r->shift++;
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078

/* Empty RCU callback: its completion (observed via rcu_barrier_bh() in
 * mini_qdisc_pair_swap()) signals that no readers use the old miniq.
 */
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

/* Publish @tp_head as the active filter list by flipping between the
 * pair's two mini_Qdisc buffers under RCU; NULL tp_head unpublishes.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		return;
	}

	/* Pick whichever of the two buffers is currently unpublished. */
	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is counterpart of the rcu barrier above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

/* Initialize a mini_Qdisc pair: both buffers share @qdisc's per-cpu
 * stats, and @p_miniq is where the active buffer gets published.
 */
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);