sch_generic.c 26.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
27
#include <linux/slab.h>
28
#include <linux/if_vlan.h>
29
#include <net/sch_generic.h>
L
Linus Torvalds 已提交
30
#include <net/pkt_sched.h>
E
Eric Dumazet 已提交
31
#include <net/dst.h>
32
#include <trace/events/qdisc.h>
L
Linus Torvalds 已提交
33

34 35 36 37
/* Qdisc to use by default (pfifo_fast); can be overridden per device. */
const struct Qdisc_ops *default_qdisc_ops = &pfifo_fast_ops;
EXPORT_SYMBOL(default_qdisc_ops);

/* Main transmission queue. */

/* Modifications to data participating in scheduling must be protected with
 * qdisc_lock(qdisc) spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via qdisc root lock
 * - ingress filtering is also serialized via qdisc root lock
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

/* Park @skb (possibly an skb list) on @q->gso_skb so it is retried first on
 * the next qdisc run, and reschedule the qdisc.  The skb remains accounted
 * in the qdisc's backlog/qlen while it waits.
 */
static inline int dev_requeue_skb(struct sk_buff *skb, struct Qdisc *q)
{
	q->gso_skb = skb;
	q->qstats.requeues++;
	qdisc_qstats_backlog_inc(q, skb);
	q->q.qlen++;	/* it's still part of the queue */
	__netif_schedule(q);

	return 0;
}

/* Dequeue additional packets from @q and chain them to @skb via skb->next,
 * limited by the byte budget still available on @txq.  Only used when all
 * packets of the qdisc map to the same txq (see qdisc_may_bulk()).
 */
static void try_bulk_dequeue_skb(struct Qdisc *q,
				 struct sk_buff *skb,
				 const struct netdev_queue *txq,
				 int *packets)
{
	int bytelimit = qdisc_avail_bulklimit(txq) - skb->len;

	while (bytelimit > 0) {
		struct sk_buff *nskb = q->dequeue(q);

		if (!nskb)
			break;

		bytelimit -= nskb->len; /* covers GSO len */
		skb->next = nskb;
		skb = nskb;
		(*packets)++; /* GSO counts as one pkt */
	}
	skb->next = NULL;
}

/* This variant of try_bulk_dequeue_skb() makes sure
 * all skbs in the chain are for the same txq
 */
static void try_bulk_dequeue_skb_slow(struct Qdisc *q,
				      struct sk_buff *skb,
				      int *packets)
{
	int mapping = skb_get_queue_mapping(skb);
	struct sk_buff *nskb;
	int cnt = 0;

	do {
		nskb = q->dequeue(q);
		if (!nskb)
			break;
		if (unlikely(skb_get_queue_mapping(nskb) != mapping)) {
			/* Different txq: stash it for a later run and stop
			 * bulking; it stays accounted in backlog/qlen.
			 */
			q->skb_bad_txq = nskb;
			qdisc_qstats_backlog_inc(q, nskb);
			q->q.qlen++;
			break;
		}
		skb->next = nskb;
		skb = nskb;
	} while (++cnt < 8);	/* cap the chain length at 8 packets */
	(*packets) += cnt;
	skb->next = NULL;
}

/* Note that dequeue_skb can possibly return a SKB list (via skb->next).
 * A requeued skb (via q->gso_skb) can also be a SKB list.
 *
 * Priority order: a previously requeued skb (already validated), then a
 * stashed wrong-txq skb, then a fresh dequeue from the qdisc.
 */
static struct sk_buff *dequeue_skb(struct Qdisc *q, bool *validate,
				   int *packets)
{
	struct sk_buff *skb = q->gso_skb;
	const struct netdev_queue *txq = q->dev_queue;

	*packets = 1;
	if (unlikely(skb)) {
		/* skb in gso_skb were already validated */
		*validate = false;
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->gso_skb = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
		} else
			skb = NULL;
		goto trace;
	}
	*validate = true;
	skb = q->skb_bad_txq;
	if (unlikely(skb)) {
		/* check the reason of requeuing without tx lock first */
		txq = skb_get_tx_queue(txq->dev, skb);
		if (!netif_xmit_frozen_or_stopped(txq)) {
			q->skb_bad_txq = NULL;
			qdisc_qstats_backlog_dec(q, skb);
			q->q.qlen--;
			goto bulk;
		}
		skb = NULL;
		goto trace;
	}
	if (!(q->flags & TCQ_F_ONETXQUEUE) ||
	    !netif_xmit_frozen_or_stopped(txq))
		skb = q->dequeue(q);
	if (skb) {
bulk:
		if (qdisc_may_bulk(q))
			try_bulk_dequeue_skb(q, skb, txq, packets);
		else
			try_bulk_dequeue_skb_slow(q, skb, packets);
	}
trace:
	trace_qdisc_dequeue(q, txq, *packets, skb);
	return skb;
}

/*
 * Transmit possibly several skbs, and handle the return status as
 * required. Owning running seqcount bit guarantees that
 * only one CPU can execute this function.
 *
 * Returns to the caller:
 *				false  - hardware queue frozen backoff
 *				true   - feel free to send more pkts
 *
 * @root_lock may be NULL (lockless qdiscs); when non-NULL it is dropped
 * around the driver transmit and re-taken before returning.
 */
bool sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
		     struct net_device *dev, struct netdev_queue *txq,
		     spinlock_t *root_lock, bool validate)
{
	int ret = NETDEV_TX_BUSY;

	/* And release qdisc */
	if (root_lock)
		spin_unlock(root_lock);

	/* Note that we validate skb (GSO, checksum, ...) outside of locks */
	if (validate)
		skb = validate_xmit_skb_list(skb, dev);

	if (likely(skb)) {
		HARD_TX_LOCK(dev, txq, smp_processor_id());
		if (!netif_xmit_frozen_or_stopped(txq))
			skb = dev_hard_start_xmit(skb, dev, txq, &ret);

		HARD_TX_UNLOCK(dev, txq);
	} else {
		/* Validation dropped everything; nothing left to send. */
		if (root_lock)
			spin_lock(root_lock);
		return true;
	}

	if (root_lock)
		spin_lock(root_lock);

	if (!dev_xmit_complete(ret)) {
		/* Driver returned NETDEV_TX_BUSY - requeue skb */
		if (unlikely(ret != NETDEV_TX_BUSY))
			net_warn_ratelimited("BUG %s code %d qlen %d\n",
					     dev->name, ret, q->q.qlen);

		dev_requeue_skb(skb, q);
		return false;
	}

	if (ret && netif_xmit_frozen_or_stopped(txq))
		return false;

	return true;
}

/*
 * NOTE: Called under qdisc_lock(q) with locally disabled BH.
 *
 * running seqcount guarantees only one CPU can process
 * this qdisc at a time. qdisc_lock(q) serializes queue accesses for
 * this queue.
 *
 *  netif_tx_lock serializes accesses to device driver.
 *
 *  qdisc_lock(q) and netif_tx_lock are mutually exclusive,
 *  if one is grabbed, another must be free.
 *
 * Note, that this procedure can be called by a watchdog timer
 *
 * Returns to the caller:
 *				0  - queue is empty or throttled.
 *				>0 - queue is not empty.
 *
 */
static inline bool qdisc_restart(struct Qdisc *q, int *packets)
{
	spinlock_t *root_lock = NULL;
	struct netdev_queue *txq;
	struct net_device *dev;
	struct sk_buff *skb;
	bool validate;

	/* Dequeue packet */
	skb = dequeue_skb(q, &validate, packets);
	if (unlikely(!skb))
		return false;

	/* Lockless (TCQ_F_NOLOCK) qdiscs transmit without the root lock. */
	if (!(q->flags & TCQ_F_NOLOCK))
		root_lock = qdisc_lock(q);

	dev = qdisc_dev(q);
	txq = skb_get_tx_queue(dev, skb);

	return sch_direct_xmit(skb, q, dev, txq, root_lock, validate);
}

/* Drain the qdisc: transmit packets until the queue empties, the quota is
 * exhausted, or another task needs the CPU; in the latter cases processing
 * is deferred to softirq via __netif_schedule().
 */
void __qdisc_run(struct Qdisc *q)
{
	int quota = dev_tx_weight;
	int packets;

	while (qdisc_restart(q, &packets)) {
		/*
		 * Ordered by possible occurrence: Postpone processing if
		 * 1. we've exceeded packet quota
		 * 2. another process needs the CPU;
		 */
		quota -= packets;
		if (quota <= 0 || need_resched()) {
			__netif_schedule(q);
			break;
		}
	}
}

275 276
unsigned long dev_trans_start(struct net_device *dev)
{
277
	unsigned long val, res;
278 279
	unsigned int i;

280 281
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);
F
Florian Westphal 已提交
282 283
	res = netdev_get_tx_queue(dev, 0)->trans_start;
	for (i = 1; i < dev->num_tx_queues; i++) {
284 285 286 287
		val = netdev_get_tx_queue(dev, i)->trans_start;
		if (val && time_after(val, res))
			res = val;
	}
288

289 290 291 292
	return res;
}
EXPORT_SYMBOL(dev_trans_start);

/* Per-device TX watchdog timer.  Fires dev->watchdog_timeo after the last
 * transmission; if any stopped TX queue has not made progress since then,
 * reports a hang via WARN_ONCE and calls the driver's ndo_tx_timeout().
 * Re-arms itself while the device remains up and carrier is present.
 */
static void dev_watchdog(struct timer_list *t)
{
	struct net_device *dev = from_timer(dev, t, watchdog_timer);

	netif_tx_lock(dev);
	if (!qdisc_tx_is_noop(dev)) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			int some_queue_timedout = 0;
			unsigned int i;
			unsigned long trans_start;

			for (i = 0; i < dev->num_tx_queues; i++) {
				struct netdev_queue *txq;

				txq = netdev_get_tx_queue(dev, i);
				trans_start = txq->trans_start;
				if (netif_xmit_stopped(txq) &&
				    time_after(jiffies, (trans_start +
							 dev->watchdog_timeo))) {
					some_queue_timedout = 1;
					txq->trans_timeout++;
					break;
				}
			}

			if (some_queue_timedout) {
				WARN_ONCE(1, KERN_INFO "NETDEV WATCHDOG: %s (%s): transmit queue %u timed out\n",
				       dev->name, netdev_drivername(dev), i);
				dev->netdev_ops->ndo_tx_timeout(dev);
			}
			/* mod_timer() returning 0 means the timer was not
			 * pending, so the new arming takes a reference.
			 */
			if (!mod_timer(&dev->watchdog_timer,
				       round_jiffies(jiffies +
						     dev->watchdog_timeo)))
				dev_hold(dev);
		}
	}
	netif_tx_unlock(dev);

	dev_put(dev);
}

/* Arm the TX watchdog if the driver implements ndo_tx_timeout; a
 * non-positive watchdog_timeo is normalized to 5 seconds.  Takes a device
 * reference when the timer was not already pending.
 */
void __netdev_watchdog_up(struct net_device *dev)
{
	if (dev->netdev_ops->ndo_tx_timeout) {
		if (dev->watchdog_timeo <= 0)
			dev->watchdog_timeo = 5*HZ;
		if (!mod_timer(&dev->watchdog_timer,
			       round_jiffies(jiffies + dev->watchdog_timeo)))
			dev_hold(dev);
	}
}

/* Wrapper used from dev_activate(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

/* Stop the TX watchdog; drops the timer's device reference if the timer
 * was still pending.  Serialized against the timer via netif_tx_lock_bh.
 */
static void dev_watchdog_down(struct net_device *dev)
{
	netif_tx_lock_bh(dev);
	if (del_timer(&dev->watchdog_timer))
		dev_put(dev);
	netif_tx_unlock_bh(dev);
}

/**
 *	netif_carrier_on - set carrier
 *	@dev: network device
 *
 * Device has detected that carrier.
 */
void netif_carrier_on(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
		if (netif_running(dev))
			__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_on);
378

/**
 *	netif_carrier_off - clear carrier
 *	@dev: network device
 *
 * Device has detected loss of carrier.
 */
void netif_carrier_off(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state)) {
		if (dev->reg_state == NETREG_UNINITIALIZED)
			return;
		atomic_inc(&dev->carrier_changes);
		linkwatch_fire_event(dev);
	}
}
EXPORT_SYMBOL(netif_carrier_off);
395

L
Linus Torvalds 已提交
396 397 398 399 400
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

/* Drop every packet; report NET_XMIT_CN so callers see congestion rather
 * than a hard error.
 */
static int noop_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			struct sk_buff **to_free)
{
	__qdisc_drop(skb, to_free);
	return NET_XMIT_CN;
}

/* The noop qdisc never holds packets. */
static struct sk_buff *noop_dequeue(struct Qdisc *qdisc)
{
	return NULL;
}

struct Qdisc_ops noop_qdisc_ops __read_mostly = {
	.id		=	"noop",
	.priv_size	=	0,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

static struct netdev_queue noop_netdev_queue = {
	.qdisc		=	&noop_qdisc,
	.qdisc_sleeping	=	&noop_qdisc,
};

/* Statically allocated built-in qdisc, shared by all queues with no
 * scheduler attached (e.g. while a device is deactivated).
 */
struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
	.ops		=	&noop_qdisc_ops,
	.q.lock		=	__SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock),
	.dev_queue	=	&noop_netdev_queue,
	.running	=	SEQCNT_ZERO(noop_qdisc.running),
	.busylock	=	__SPIN_LOCK_UNLOCKED(noop_qdisc.busylock),
};
EXPORT_SYMBOL(noop_qdisc);
L
Linus Torvalds 已提交
438

P
Phil Sutter 已提交
439 440 441 442 443 444 445 446 447 448
static int noqueue_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	/* register_qdisc() assigns a default of noop_enqueue if unset,
	 * but __dev_queue_xmit() treats noqueue only as such
	 * if this is NULL - so clear it here. */
	qdisc->enqueue = NULL;
	return 0;
}

struct Qdisc_ops noqueue_qdisc_ops __read_mostly = {
	.id		=	"noqueue",
	.priv_size	=	0,
	.init		=	noqueue_init,
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.peek		=	noop_dequeue,
	.owner		=	THIS_MODULE,
};

/* Map TC_PRIO_* values (0..TC_PRIO_MAX) to one of the three bands;
 * band 0 has the highest dequeue priority.
 */
static const u8 prio2band[TC_PRIO_MAX + 1] = {
	1, 2, 2, 2, 1, 2, 0, 0 , 1, 1, 1, 1, 1, 1, 1, 1
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

#define PFIFO_FAST_BANDS 3

/*
 * Private data for a pfifo_fast scheduler containing:
 * 	- queues for the three band
 * 	- bitmap indicating which of the bands contain skbs
 */
struct pfifo_fast_priv {
	u32 bitmap;
	struct qdisc_skb_head q[PFIFO_FAST_BANDS];
};

/*
 * Convert a bitmap to the first band number where an skb is queued, where:
 * 	bitmap=0 means there are no skbs on any band.
 * 	bitmap=1 means there is an skb on band 0.
 *	bitmap=7 means there are skbs on all 3 bands, etc.
 */
static const int bitmap2band[] = {-1, 0, 1, 0, 2, 0, 1, 0};

/* Return the skb list head for @band of @priv. */
static inline struct qdisc_skb_head *band2list(struct pfifo_fast_priv *priv,
					     int band)
{
	return priv->q + band;
}

/* Enqueue @skb on the band selected by its priority, or drop it when the
 * device's tx_queue_len limit has been reached.
 */
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc *qdisc,
			      struct sk_buff **to_free)
{
	if (qdisc->q.qlen < qdisc_dev(qdisc)->tx_queue_len) {
		int band = prio2band[skb->priority & TC_PRIO_MAX];
		struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
		struct qdisc_skb_head *list = band2list(priv, band);

		priv->bitmap |= (1 << band);
		qdisc->q.qlen++;
		return __qdisc_enqueue_tail(skb, qdisc, list);
	}

	return qdisc_drop(skb, qdisc, to_free);
}

/* Dequeue from the highest-priority non-empty band (via bitmap2band),
 * clearing the band's bitmap bit when it drains.
 */
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (likely(band >= 0)) {
		struct qdisc_skb_head *qh = band2list(priv, band);
		struct sk_buff *skb = __qdisc_dequeue_head(qh);

		if (likely(skb != NULL)) {
			qdisc_qstats_backlog_dec(qdisc, skb);
			qdisc_bstats_update(qdisc, skb);
		}

		qdisc->q.qlen--;
		if (qh->qlen == 0)
			priv->bitmap &= ~(1 << band);

		return skb;
	}

	return NULL;
}

/* Return (without removing) the head of the highest-priority non-empty
 * band, or NULL when the qdisc is empty.
 */
static struct sk_buff *pfifo_fast_peek(struct Qdisc *qdisc)
{
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);
	int band = bitmap2band[priv->bitmap];

	if (band >= 0) {
		struct qdisc_skb_head *qh = band2list(priv, band);

		return qh->head;
	}

	return NULL;
}

/* Free all queued skbs in every band and zero the bookkeeping state. */
static void pfifo_fast_reset(struct Qdisc *qdisc)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		__qdisc_reset_queue(band2list(priv, prio));

	priv->bitmap = 0;
	qdisc->qstats.backlog = 0;
	qdisc->q.qlen = 0;
}

/* Dump the band count and prio->band map as a tc_prio_qopt netlink TLV. */
static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX + 1);
	if (nla_put(skb, TCA_OPTIONS, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	return -1;
}

/* Initialize the per-band queues; @opt is ignored (no parameters). */
static int pfifo_fast_init(struct Qdisc *qdisc, struct nlattr *opt)
{
	int prio;
	struct pfifo_fast_priv *priv = qdisc_priv(qdisc);

	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		qdisc_skb_head_init(band2list(priv, prio));

	/* Can by-pass the queue discipline */
	qdisc->flags |= TCQ_F_CAN_BYPASS;
	return 0;
}

struct Qdisc_ops pfifo_fast_ops __read_mostly = {
	.id		=	"pfifo_fast",
	.priv_size	=	sizeof(struct pfifo_fast_priv),
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.peek		=	pfifo_fast_peek,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};
EXPORT_SYMBOL(pfifo_fast_ops);
L
Linus Torvalds 已提交
597

/* Default lockdep classes for qdiscs whose device does not provide its
 * own keys.
 */
static struct lock_class_key qdisc_tx_busylock;
static struct lock_class_key qdisc_running_key;

/* Allocate and minimally initialize a Qdisc (struct + ops->priv_size
 * bytes) on @dev_queue's NUMA node.  Returns an ERR_PTR on failure; the
 * caller owns the initial reference and holds a reference on the device.
 */
struct Qdisc *qdisc_alloc(struct netdev_queue *dev_queue,
			  const struct Qdisc_ops *ops)
{
	void *p;
	struct Qdisc *sch;
	unsigned int size = QDISC_ALIGN(sizeof(*sch)) + ops->priv_size;
	int err = -ENOBUFS;
	struct net_device *dev;

	if (!dev_queue) {
		err = -EINVAL;
		goto errout;
	}

	dev = dev_queue->dev;
	p = kzalloc_node(size, GFP_KERNEL,
			 netdev_queue_numa_node_read(dev_queue));

	if (!p)
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	/* if we got non aligned memory, ask more and do alignment ourself */
	if (sch != p) {
		kfree(p);
		p = kzalloc_node(size + QDISC_ALIGNTO - 1, GFP_KERNEL,
				 netdev_queue_numa_node_read(dev_queue));
		if (!p)
			goto errout;
		sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
		sch->padded = (char *) sch - (char *) p;
	}
	qdisc_skb_head_init(&sch->q);
	spin_lock_init(&sch->q.lock);

	/* Lockless qdiscs keep their stats in per-cpu counters. */
	if (ops->static_flags & TCQ_F_CPUSTATS) {
		sch->cpu_bstats =
			netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
		if (!sch->cpu_bstats)
			goto errout1;

		sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
		if (!sch->cpu_qstats) {
			free_percpu(sch->cpu_bstats);
			goto errout1;
		}
	}

	spin_lock_init(&sch->busylock);
	lockdep_set_class(&sch->busylock,
			  dev->qdisc_tx_busylock ?: &qdisc_tx_busylock);

	seqcount_init(&sch->running);
	lockdep_set_class(&sch->running,
			  dev->qdisc_running_key ?: &qdisc_running_key);

	sch->ops = ops;
	sch->flags = ops->static_flags;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev_queue = dev_queue;
	dev_hold(dev);
	refcount_set(&sch->refcnt, 1);

	return sch;
errout1:
	kfree(p);
errout:
	return ERR_PTR(err);
}

/* Allocate a qdisc of type @ops under @parentid and run its init (with
 * NULL options).  Returns NULL on any failure; module and qdisc
 * references are released on the error paths.
 */
struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue,
				const struct Qdisc_ops *ops,
				unsigned int parentid)
{
	struct Qdisc *sch;

	if (!try_module_get(ops->owner))
		return NULL;

	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		module_put(ops->owner);
		return NULL;
	}
	sch->parent = parentid;

	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

	qdisc_destroy(sch);
	return NULL;
}
EXPORT_SYMBOL(qdisc_create_dflt);
L
Linus Torvalds 已提交
694

695
/* Under qdisc_lock(qdisc) and BH! */
L
Linus Torvalds 已提交
696 697 698

void qdisc_reset(struct Qdisc *qdisc)
{
699
	const struct Qdisc_ops *ops = qdisc->ops;
L
Linus Torvalds 已提交
700 701 702

	if (ops->reset)
		ops->reset(qdisc);
703

704 705 706
	kfree_skb(qdisc->skb_bad_txq);
	qdisc->skb_bad_txq = NULL;

707
	if (qdisc->gso_skb) {
708
		kfree_skb_list(qdisc->gso_skb);
709 710
		qdisc->gso_skb = NULL;
	}
711
	qdisc->q.qlen = 0;
712
	qdisc->qstats.backlog = 0;
L
Linus Torvalds 已提交
713
}
714
EXPORT_SYMBOL(qdisc_reset);
L
Linus Torvalds 已提交
715

/* Release the qdisc memory, including per-cpu stats when present; undoes
 * the alignment padding recorded by qdisc_alloc().
 */
static void qdisc_free(struct Qdisc *qdisc)
{
	if (qdisc_is_percpu_stats(qdisc)) {
		free_percpu(qdisc->cpu_bstats);
		free_percpu(qdisc->cpu_qstats);
	}

	kfree((char *) qdisc - qdisc->padded);
}

/* Drop one reference to @qdisc; on the last reference, tear it down:
 * unhash, kill the rate estimator, run ops->reset/destroy, release the
 * module and device references, free pending skbs and the memory.
 * Built-in qdiscs (TCQ_F_BUILTIN) are never destroyed.
 */
void qdisc_destroy(struct Qdisc *qdisc)
{
	const struct Qdisc_ops  *ops = qdisc->ops;

	if (qdisc->flags & TCQ_F_BUILTIN ||
	    !refcount_dec_and_test(&qdisc->refcnt))
		return;

#ifdef CONFIG_NET_SCHED
	qdisc_hash_del(qdisc);

	qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
	gen_kill_estimator(&qdisc->rate_est);
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);

	module_put(ops->owner);
	dev_put(qdisc_dev(qdisc));

	kfree_skb_list(qdisc->gso_skb);
	kfree_skb(qdisc->skb_bad_txq);
	qdisc_free(qdisc);
}
EXPORT_SYMBOL(qdisc_destroy);
L
Linus Torvalds 已提交
753

/* Attach toplevel qdisc to device queue. */
struct Qdisc *dev_graft_qdisc(struct netdev_queue *dev_queue,
			      struct Qdisc *qdisc)
{
	struct Qdisc *oqdisc = dev_queue->qdisc_sleeping;
	spinlock_t *root_lock;

	root_lock = qdisc_lock(oqdisc);
	spin_lock_bh(root_lock);

	/* Prune old scheduler */
	if (oqdisc && refcount_read(&oqdisc->refcnt) <= 1)
		qdisc_reset(oqdisc);

	/* ... and graft new one; the active pointer stays on noop_qdisc
	 * until the device is (re)activated.
	 */
	if (qdisc == NULL)
		qdisc = &noop_qdisc;
	dev_queue->qdisc_sleeping = qdisc;
	rcu_assign_pointer(dev_queue->qdisc, &noop_qdisc);

	spin_unlock_bh(root_lock);

	return oqdisc;
}
EXPORT_SYMBOL(dev_graft_qdisc);
779

/* Create the default root qdisc for one TX queue: noqueue for
 * IFF_NO_QUEUE devices, otherwise default_qdisc_ops.  On single-queue
 * devices the qdisc also becomes the one-txqueue fast path.
 */
static void attach_one_default_qdisc(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_unused)
{
	struct Qdisc *qdisc;
	const struct Qdisc_ops *ops = default_qdisc_ops;

	if (dev->priv_flags & IFF_NO_QUEUE)
		ops = &noqueue_qdisc_ops;

	qdisc = qdisc_create_dflt(dev_queue, ops, TC_H_ROOT);
	if (!qdisc) {
		netdev_info(dev, "activation failed\n");
		return;
	}
	if (!netif_is_multiqueue(dev))
		qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
	dev_queue->qdisc_sleeping = qdisc;
}

/* Install default qdiscs on a newly activated device: per-queue defaults
 * for single-queue/no-queue devices, an mq root qdisc otherwise.
 */
static void attach_default_qdiscs(struct net_device *dev)
{
	struct netdev_queue *txq;
	struct Qdisc *qdisc;

	txq = netdev_get_tx_queue(dev, 0);

	if (!netif_is_multiqueue(dev) ||
	    dev->priv_flags & IFF_NO_QUEUE) {
		netdev_for_each_tx_queue(dev, attach_one_default_qdisc, NULL);
		dev->qdisc = txq->qdisc_sleeping;
		qdisc_refcount_inc(dev->qdisc);
	} else {
		qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT);
		if (qdisc) {
			dev->qdisc = qdisc;
			qdisc->ops->attach(qdisc);
		}
	}
#ifdef CONFIG_NET_SCHED
	if (dev->qdisc != &noop_qdisc)
		qdisc_hash_add(dev->qdisc, false);
#endif
}

/* Switch one queue's active qdisc pointer to its sleeping qdisc and clear
 * its DEACTIVATED bit.  @_need_watchdog is NULL for the ingress queue,
 * which needs no TX watchdog.
 */
static void transition_one_qdisc(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_need_watchdog)
{
	struct Qdisc *new_qdisc = dev_queue->qdisc_sleeping;
	int *need_watchdog_p = _need_watchdog;

	if (!(new_qdisc->flags & TCQ_F_BUILTIN))
		clear_bit(__QDISC_STATE_DEACTIVATED, &new_qdisc->state);

	rcu_assign_pointer(dev_queue->qdisc, new_qdisc);
	if (need_watchdog_p) {
		dev_queue->trans_start = 0;
		*need_watchdog_p = 1;
	}
}

/* Bring the device's packet schedulers online: attach defaults if none
 * configured, swap the active qdisc pointers in, and start the TX
 * watchdog.  Deferred until carrier-on when the link is down.
 */
void dev_activate(struct net_device *dev)
{
	int need_watchdog;

	/* No queueing discipline is attached to device;
	 * create default one for devices, which need queueing
	 * and noqueue_qdisc for virtual interfaces
	 */

	if (dev->qdisc == &noop_qdisc)
		attach_default_qdiscs(dev);

	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

	need_watchdog = 0;
	netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog);
	if (dev_ingress_queue(dev))
		transition_one_qdisc(dev, dev_ingress_queue(dev), NULL);

	if (need_watchdog) {
		netif_trans_update(dev);
		dev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(dev_activate);
869

/* Point one queue's active qdisc at @_qdisc_default (normally noop),
 * mark the old qdisc DEACTIVATED and flush its packets.
 */
static void dev_deactivate_queue(struct net_device *dev,
				 struct netdev_queue *dev_queue,
				 void *_qdisc_default)
{
	struct Qdisc *qdisc_default = _qdisc_default;
	struct Qdisc *qdisc;

	qdisc = rtnl_dereference(dev_queue->qdisc);
	if (qdisc) {
		spin_lock_bh(qdisc_lock(qdisc));

		if (!(qdisc->flags & TCQ_F_BUILTIN))
			set_bit(__QDISC_STATE_DEACTIVATED, &qdisc->state);

		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		qdisc_reset(qdisc);

		spin_unlock_bh(qdisc_lock(qdisc));
	}
}

/* Return true if any TX queue's qdisc is still running or scheduled for a
 * softirq run; used to wait out in-flight qdisc_run calls on deactivate.
 */
static bool some_qdisc_is_busy(struct net_device *dev)
{
	unsigned int i;

	for (i = 0; i < dev->num_tx_queues; i++) {
		struct netdev_queue *dev_queue;
		spinlock_t *root_lock;
		struct Qdisc *q;
		int val;

		dev_queue = netdev_get_tx_queue(dev, i);
		q = dev_queue->qdisc_sleeping;

		if (q->flags & TCQ_F_NOLOCK) {
			val = test_bit(__QDISC_STATE_SCHED, &q->state);
		} else {
			root_lock = qdisc_lock(q);
			spin_lock_bh(root_lock);

			val = (qdisc_is_running(q) ||
			       test_bit(__QDISC_STATE_SCHED, &q->state));

			spin_unlock_bh(root_lock);
		}

		if (val)
			return true;
	}
	return false;
}

/**
 * 	dev_deactivate_many - deactivate transmissions on several devices
 * 	@head: list of devices to deactivate
 *
 *	This function returns only when all outstanding transmissions
 *	have completed, unless all devices are in dismantle phase.
 */
void dev_deactivate_many(struct list_head *head)
{
	struct net_device *dev;
	bool sync_needed = false;

	list_for_each_entry(dev, head, close_list) {
		netdev_for_each_tx_queue(dev, dev_deactivate_queue,
					 &noop_qdisc);
		if (dev_ingress_queue(dev))
			dev_deactivate_queue(dev, dev_ingress_queue(dev),
					     &noop_qdisc);

		dev_watchdog_down(dev);
		sync_needed |= !dev->dismantle;
	}

	/* Wait for outstanding qdisc-less dev_queue_xmit calls.
	 * This is avoided if all devices are in dismantle phase :
	 * Caller will call synchronize_net() for us
	 */
	if (sync_needed)
		synchronize_net();

	/* Wait for outstanding qdisc_run calls. */
	list_for_each_entry(dev, head, close_list)
		while (some_qdisc_is_busy(dev))
			yield();
}

/* Single-device convenience wrapper around dev_deactivate_many(). */
void dev_deactivate(struct net_device *dev)
{
	LIST_HEAD(single);

	list_add(&dev->close_list, &single);
	dev_deactivate_many(&single);
	list_del(&single);
}
EXPORT_SYMBOL(dev_deactivate);
L
Linus Torvalds 已提交
967

/* Point one queue's active and sleeping qdisc at @_qdisc. */
static void dev_init_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc)
{
	struct Qdisc *qdisc = _qdisc;

	rcu_assign_pointer(dev_queue->qdisc, qdisc);
	dev_queue->qdisc_sleeping = qdisc;
}

/* Initialize a new device's scheduling state: all queues (including
 * ingress) start on noop_qdisc, and the TX watchdog timer is set up
 * (but not armed).
 */
void dev_init_scheduler(struct net_device *dev)
{
	dev->qdisc = &noop_qdisc;
	netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);

	timer_setup(&dev->watchdog_timer, dev_watchdog, 0);
}

/* Detach and destroy one queue's sleeping qdisc, replacing both active
 * and sleeping pointers with @_qdisc_default.
 */
static void shutdown_scheduler_queue(struct net_device *dev,
				     struct netdev_queue *dev_queue,
				     void *_qdisc_default)
{
	struct Qdisc *qdisc = dev_queue->qdisc_sleeping;
	struct Qdisc *qdisc_default = _qdisc_default;

	if (qdisc) {
		rcu_assign_pointer(dev_queue->qdisc, qdisc_default);
		dev_queue->qdisc_sleeping = qdisc_default;

		qdisc_destroy(qdisc);
	}
}

/* Tear down all qdiscs of a device being unregistered; the watchdog must
 * already be stopped at this point.
 */
void dev_shutdown(struct net_device *dev)
{
	netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc);
	if (dev_ingress_queue(dev))
		shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc);
	qdisc_destroy(dev->qdisc);
	dev->qdisc = &noop_qdisc;

	WARN_ON(timer_pending(&dev->watchdog_timer));
}
1013

1014
void psched_ratecfg_precompute(struct psched_ratecfg *r,
1015 1016
			       const struct tc_ratespec *conf,
			       u64 rate64)
1017
{
1018 1019
	memset(r, 0, sizeof(*r));
	r->overhead = conf->overhead;
1020
	r->rate_bytes_ps = max_t(u64, conf->rate, rate64);
1021
	r->linklayer = (conf->linklayer & TC_LINKLAYER_MASK);
1022 1023
	r->mult = 1;
	/*
1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034
	 * The deal here is to replace a divide by a reciprocal one
	 * in fast path (a reciprocal divide is a multiply and a shift)
	 *
	 * Normal formula would be :
	 *  time_in_ns = (NSEC_PER_SEC * len) / rate_bps
	 *
	 * We compute mult/shift to use instead :
	 *  time_in_ns = (len * mult) >> shift;
	 *
	 * We try to get the highest possible mult value for accuracy,
	 * but have to make sure no overflows will ever happen.
1035
	 */
1036 1037 1038 1039 1040 1041
	if (r->rate_bytes_ps > 0) {
		u64 factor = NSEC_PER_SEC;

		for (;;) {
			r->mult = div64_u64(factor, r->rate_bytes_ps);
			if (r->mult & (1U << 31) || factor & (1ULL << 63))
1042
				break;
1043 1044
			factor <<= 1;
			r->shift++;
1045 1046 1047 1048
		}
	}
}
EXPORT_SYMBOL(psched_ratecfg_precompute);
/* Empty RCU callback: its completion alone signals that no reader can
 * still see the retiring mini_Qdisc.
 */
static void mini_qdisc_rcu_func(struct rcu_head *head)
{
}

/* Publish @tp_head as the filter list via the pair's inactive mini_Qdisc
 * (double-buffering): wait for the previous RCU callback, fill the spare
 * buffer, swap it in, then retire the old one with call_rcu_bh.
 */
void mini_qdisc_pair_swap(struct mini_Qdisc_pair *miniqp,
			  struct tcf_proto *tp_head)
{
	struct mini_Qdisc *miniq_old = rtnl_dereference(*miniqp->p_miniq);
	struct mini_Qdisc *miniq;

	if (!tp_head) {
		RCU_INIT_POINTER(*miniqp->p_miniq, NULL);
		return;
	}

	miniq = !miniq_old || miniq_old == &miniqp->miniq2 ?
		&miniqp->miniq1 : &miniqp->miniq2;

	/* We need to make sure that readers won't see the miniq
	 * we are about to modify. So wait until previous call_rcu_bh callback
	 * is done.
	 */
	rcu_barrier_bh();
	miniq->filter_list = tp_head;
	rcu_assign_pointer(*miniqp->p_miniq, miniq);

	if (miniq_old)
		/* This is counterpart of the rcu barrier above. We need to
		 * block potential new user of miniq_old until all readers
		 * are not seeing it.
		 */
		call_rcu_bh(&miniq_old->rcu, mini_qdisc_rcu_func);
}
EXPORT_SYMBOL(mini_qdisc_pair_swap);

/* Initialize a mini_Qdisc pair, sharing @qdisc's per-cpu stats between
 * both buffers and recording where the active pointer lives.
 */
void mini_qdisc_pair_init(struct mini_Qdisc_pair *miniqp, struct Qdisc *qdisc,
			  struct mini_Qdisc __rcu **p_miniq)
{
	miniqp->miniq1.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq1.cpu_qstats = qdisc->cpu_qstats;
	miniqp->miniq2.cpu_bstats = qdisc->cpu_bstats;
	miniqp->miniq2.cpu_qstats = qdisc->cpu_qstats;
	miniqp->p_miniq = p_miniq;
}
EXPORT_SYMBOL(mini_qdisc_pair_init);