sch_generic.c 13.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38
/*
 * net/sched/sch_generic.c	Generic packet scheduler routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *              Jamal Hadi Salim, <hadi@cyberus.ca> 990601
 *              - Ingress support
 */

#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/in.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <linux/skbuff.h>
#include <linux/rtnetlink.h>
#include <linux/init.h>
#include <linux/rcupdate.h>
#include <linux/list.h>
#include <net/sock.h>
#include <net/pkt_sched.h>

/* Main transmission queue. */

39 40 41 42 43 44 45
/* Modifications to data participating in scheduling must be protected with
 * dev->queue_lock spinlock.
 *
 * The idea is the following:
 * - enqueue, dequeue are serialized via top level device
 *   spinlock dev->queue_lock.
 * - updates to tree and tree walking are only done under the rtnl mutex.
 */

/* Acquire dev->queue_lock via spin_lock_bh(); pair with qdisc_unlock_tree(). */
void qdisc_lock_tree(struct net_device *dev)
{
	spin_lock_bh(&dev->queue_lock);
}

/* Release dev->queue_lock via spin_unlock_bh(); counterpart of qdisc_lock_tree(). */
void qdisc_unlock_tree(struct net_device *dev)
{
	spin_unlock_bh(&dev->queue_lock);
}

58
/*
   dev->queue_lock serializes queue accesses for this device
   AND dev->qdisc pointer itself.

   netif_tx_lock serializes accesses to device driver.

   dev->queue_lock and netif_tx_lock are mutually exclusive,
   if one is grabbed, another must be free.
 */


/* Kick device.
   Note, that this procedure can be called by a watchdog timer, so that
   we do not check dev->tbusy flag here.

   Returns:  0  - queue is empty.
	    >0  - queue is not empty, but throttled.
	    <0  - queue is not empty. Device is throttled, if dev->tbusy != 0.

   NOTE: Called under dev->queue_lock with locally disabled BH.
*/

H
Herbert Xu 已提交
80
static inline int qdisc_restart(struct net_device *dev)
L
Linus Torvalds 已提交
81 82 83 84 85
{
	struct Qdisc *q = dev->qdisc;
	struct sk_buff *skb;

	/* Dequeue packet */
86
	if (((skb = dev->gso_skb)) || ((skb = q->dequeue(q)))) {
L
Linus Torvalds 已提交
87
		unsigned nolock = (dev->features & NETIF_F_LLTX);
88 89 90

		dev->gso_skb = NULL;

L
Linus Torvalds 已提交
91 92 93 94 95 96 97 98 99 100
		/*
		 * When the driver has LLTX set it does its own locking
		 * in start_xmit. No need to add additional overhead by
		 * locking again. These checks are worth it because
		 * even uncongested locks can be quite expensive.
		 * The driver can do trylock like here too, in case
		 * of lock congestion it should return -1 and the packet
		 * will be requeued.
		 */
		if (!nolock) {
H
Herbert Xu 已提交
101
			if (!netif_tx_trylock(dev)) {
L
Linus Torvalds 已提交
102 103
			collision:
				/* So, someone grabbed the driver. */
104

L
Linus Torvalds 已提交
105 106 107 108 109 110 111 112 113 114 115 116 117 118 119
				/* It may be transient configuration error,
				   when hard_start_xmit() recurses. We detect
				   it by checking xmit owner and drop the
				   packet when deadloop is detected.
				*/
				if (dev->xmit_lock_owner == smp_processor_id()) {
					kfree_skb(skb);
					if (net_ratelimit())
						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
					return -1;
				}
				__get_cpu_var(netdev_rx_stat).cpu_collision++;
				goto requeue;
			}
		}
120

L
Linus Torvalds 已提交
121 122 123 124 125 126 127
		{
			/* And release queue */
			spin_unlock(&dev->queue_lock);

			if (!netif_queue_stopped(dev)) {
				int ret;

128
				ret = dev_hard_start_xmit(skb, dev);
129
				if (ret == NETDEV_TX_OK) {
L
Linus Torvalds 已提交
130
					if (!nolock) {
H
Herbert Xu 已提交
131
						netif_tx_unlock(dev);
L
Linus Torvalds 已提交
132 133 134 135 136 137
					}
					spin_lock(&dev->queue_lock);
					return -1;
				}
				if (ret == NETDEV_TX_LOCKED && nolock) {
					spin_lock(&dev->queue_lock);
138
					goto collision;
L
Linus Torvalds 已提交
139 140 141 142 143
				}
			}

			/* NETDEV_TX_BUSY - we need to requeue */
			/* Release the driver */
144
			if (!nolock) {
H
Herbert Xu 已提交
145
				netif_tx_unlock(dev);
146
			}
L
Linus Torvalds 已提交
147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
			spin_lock(&dev->queue_lock);
			q = dev->qdisc;
		}

		/* Device kicked us out :(
		   This is possible in three cases:

		   0. driver is locked
		   1. fastroute is enabled
		   2. device cannot determine busy state
		      before start of transmission (f.e. dialout)
		   3. device is buggy (ppp)
		 */

requeue:
162 163 164 165
		if (skb->next)
			dev->gso_skb = skb;
		else
			q->ops->requeue(skb, q);
L
Linus Torvalds 已提交
166 167 168
		netif_schedule(dev);
		return 1;
	}
169
	BUG_ON((int) q->q.qlen < 0);
L
Linus Torvalds 已提交
170 171 172
	return q->q.qlen;
}

H
Herbert Xu 已提交
173 174
void __qdisc_run(struct net_device *dev)
{
175 176 177
	if (unlikely(dev->qdisc == &noop_qdisc))
		goto out;

H
Herbert Xu 已提交
178 179 180
	while (qdisc_restart(dev) < 0 && !netif_queue_stopped(dev))
		/* NOTHING */;

181
out:
H
Herbert Xu 已提交
182 183 184
	clear_bit(__LINK_STATE_QDISC_RUNNING, &dev->state);
}

L
Linus Torvalds 已提交
185 186 187 188
static void dev_watchdog(unsigned long arg)
{
	struct net_device *dev = (struct net_device *)arg;

H
Herbert Xu 已提交
189
	netif_tx_lock(dev);
L
Linus Torvalds 已提交
190 191 192 193 194
	if (dev->qdisc != &noop_qdisc) {
		if (netif_device_present(dev) &&
		    netif_running(dev) &&
		    netif_carrier_ok(dev)) {
			if (netif_queue_stopped(dev) &&
195 196 197 198
			    time_after(jiffies, dev->trans_start + dev->watchdog_timeo)) {

				printk(KERN_INFO "NETDEV WATCHDOG: %s: transmit timed out\n",
				       dev->name);
L
Linus Torvalds 已提交
199 200
				dev->tx_timeout(dev);
			}
201
			if (!mod_timer(&dev->watchdog_timer, round_jiffies(jiffies + dev->watchdog_timeo)))
L
Linus Torvalds 已提交
202 203 204
				dev_hold(dev);
		}
	}
H
Herbert Xu 已提交
205
	netif_tx_unlock(dev);
L
Linus Torvalds 已提交
206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233

	dev_put(dev);
}

/* Prepare the per-device watchdog timer so it fires dev_watchdog(dev). */
static void dev_watchdog_init(struct net_device *dev)
{
	init_timer(&dev->watchdog_timer);

	dev->watchdog_timer.function = dev_watchdog;
	dev->watchdog_timer.data = (unsigned long)dev;
}

/*
 * Arm the transmit watchdog for devices that provide a tx_timeout hook.
 * A non-positive watchdog_timeo is replaced by a 5 second default.
 */
void __netdev_watchdog_up(struct net_device *dev)
{
	if (!dev->tx_timeout)
		return;

	if (dev->watchdog_timeo <= 0)
		dev->watchdog_timeo = 5*HZ;

	/* Hold a reference for the armed timer if it was not already pending. */
	if (!mod_timer(&dev->watchdog_timer, jiffies + dev->watchdog_timeo))
		dev_hold(dev);
}

/* File-local wrapper around __netdev_watchdog_up(). */
static void dev_watchdog_up(struct net_device *dev)
{
	__netdev_watchdog_up(dev);
}

static void dev_watchdog_down(struct net_device *dev)
{
H
Herbert Xu 已提交
234
	netif_tx_lock_bh(dev);
L
Linus Torvalds 已提交
235
	if (del_timer(&dev->watchdog_timer))
236
		dev_put(dev);
H
Herbert Xu 已提交
237
	netif_tx_unlock_bh(dev);
L
Linus Torvalds 已提交
238 239
}

240 241 242 243 244 245 246 247 248 249 250 251 252 253
/*
 * Clear the NOCARRIER state bit; fire a linkwatch event only if the bit
 * was previously set.  If the interface is running, (re)arm the watchdog.
 */
void netif_carrier_on(struct net_device *dev)
{
	int was_off = test_and_clear_bit(__LINK_STATE_NOCARRIER, &dev->state);

	if (was_off)
		linkwatch_fire_event(dev);

	if (netif_running(dev))
		__netdev_watchdog_up(dev);
}

/*
 * Set the NOCARRIER state bit; fire a linkwatch event only if the bit
 * was not already set.
 */
void netif_carrier_off(struct net_device *dev)
{
	int was_off = test_and_set_bit(__LINK_STATE_NOCARRIER, &dev->state);

	if (!was_off)
		linkwatch_fire_event(dev);
}

L
Linus Torvalds 已提交
254 255 256 257 258
/* "NOOP" scheduler: the best scheduler, recommended for all interfaces
   under all circumstances. It is difficult to invent anything faster or
   cheaper.
 */

259
static int noop_enqueue(struct sk_buff *skb, struct Qdisc * qdisc)
L
Linus Torvalds 已提交
260 261 262 263 264
{
	kfree_skb(skb);
	return NET_XMIT_CN;
}

265
static struct sk_buff *noop_dequeue(struct Qdisc * qdisc)
L
Linus Torvalds 已提交
266 267 268 269
{
	return NULL;
}

270
static int noop_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
L
Linus Torvalds 已提交
271 272
{
	if (net_ratelimit())
273 274
		printk(KERN_DEBUG "%s deferred output. It is buggy.\n",
		       skb->dev->name);
L
Linus Torvalds 已提交
275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291
	kfree_skb(skb);
	return NET_XMIT_CN;
}

/* Operations table of the "noop" qdisc; it carries no private data. */
struct Qdisc_ops noop_qdisc_ops = {
	.id = "noop",
	.owner = THIS_MODULE,
	.priv_size = 0,
	.enqueue = noop_enqueue,
	.dequeue = noop_dequeue,
	.requeue = noop_requeue,
};

struct Qdisc noop_qdisc = {
	.enqueue	=	noop_enqueue,
	.dequeue	=	noop_dequeue,
	.flags		=	TCQ_F_BUILTIN,
292
	.ops		=	&noop_qdisc_ops,
L
Linus Torvalds 已提交
293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320
	.list		=	LIST_HEAD_INIT(noop_qdisc.list),
};

/* Operations table of the "noqueue" qdisc; it reuses the noop hooks. */
static struct Qdisc_ops noqueue_qdisc_ops = {
	.id = "noqueue",
	.owner = THIS_MODULE,
	.priv_size = 0,
	.enqueue = noop_enqueue,
	.dequeue = noop_dequeue,
	.requeue = noop_requeue,
};

/*
 * The statically allocated noqueue qdisc instance.  Unlike noop_qdisc its
 * enqueue pointer is deliberately NULL.
 */
static struct Qdisc noqueue_qdisc = {
	.ops = &noqueue_qdisc_ops,
	.flags = TCQ_F_BUILTIN,
	.enqueue = NULL,
	.dequeue = noop_dequeue,
	.list = LIST_HEAD_INIT(noqueue_qdisc.list),
};


/* Band index for each packet priority value 0..TC_PRIO_MAX. */
static const u8 prio2band[TC_PRIO_MAX+1] = {
	1, 2, 2, 2, 1, 2, 0, 0,		/* priorities 0..7  */
	1, 1, 1, 1, 1, 1, 1, 1,		/* priorities 8..15 */
};

/* 3-band FIFO queue: old style, but should be a bit faster than
   generic prio+fifo combination.
 */

321 322
/* Number of sk_buff_head lists (bands) kept in a pfifo_fast qdisc's private area. */
#define PFIFO_FAST_BANDS 3

323 324
static inline struct sk_buff_head *prio2list(struct sk_buff *skb,
					     struct Qdisc *qdisc)
L
Linus Torvalds 已提交
325 326
{
	struct sk_buff_head *list = qdisc_priv(qdisc);
327 328
	return list + prio2band[skb->priority & TC_PRIO_MAX];
}
L
Linus Torvalds 已提交
329

330
static int pfifo_fast_enqueue(struct sk_buff *skb, struct Qdisc* qdisc)
331 332
{
	struct sk_buff_head *list = prio2list(skb, qdisc);
L
Linus Torvalds 已提交
333

334
	if (skb_queue_len(list) < qdisc->dev->tx_queue_len) {
L
Linus Torvalds 已提交
335
		qdisc->q.qlen++;
336
		return __qdisc_enqueue_tail(skb, qdisc, list);
L
Linus Torvalds 已提交
337
	}
338 339

	return qdisc_drop(skb, qdisc);
L
Linus Torvalds 已提交
340 341
}

342
static struct sk_buff *pfifo_fast_dequeue(struct Qdisc* qdisc)
L
Linus Torvalds 已提交
343 344 345 346
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

347 348
	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++) {
		if (!skb_queue_empty(list + prio)) {
L
Linus Torvalds 已提交
349
			qdisc->q.qlen--;
350
			return __qdisc_dequeue_head(qdisc, list + prio);
L
Linus Torvalds 已提交
351 352
		}
	}
353

L
Linus Torvalds 已提交
354 355 356
	return NULL;
}

357
static int pfifo_fast_requeue(struct sk_buff *skb, struct Qdisc* qdisc)
L
Linus Torvalds 已提交
358 359
{
	qdisc->q.qlen++;
360
	return __qdisc_requeue(skb, qdisc, prio2list(skb, qdisc));
L
Linus Torvalds 已提交
361 362
}

363
static void pfifo_fast_reset(struct Qdisc* qdisc)
L
Linus Torvalds 已提交
364 365 366 367
{
	int prio;
	struct sk_buff_head *list = qdisc_priv(qdisc);

368
	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
369 370 371
		__qdisc_reset_queue(qdisc, list + prio);

	qdisc->qstats.backlog = 0;
L
Linus Torvalds 已提交
372 373 374 375 376
	qdisc->q.qlen = 0;
}

static int pfifo_fast_dump(struct Qdisc *qdisc, struct sk_buff *skb)
{
377
	struct tc_prio_qopt opt = { .bands = PFIFO_FAST_BANDS };
L
Linus Torvalds 已提交
378 379 380 381 382 383 384 385 386 387 388

	memcpy(&opt.priomap, prio2band, TC_PRIO_MAX+1);
	RTA_PUT(skb, TCA_OPTIONS, sizeof(opt), &opt);
	return skb->len;

rtattr_failure:
	return -1;
}

static int pfifo_fast_init(struct Qdisc *qdisc, struct rtattr *opt)
{
389
	int prio;
L
Linus Torvalds 已提交
390 391
	struct sk_buff_head *list = qdisc_priv(qdisc);

392 393
	for (prio = 0; prio < PFIFO_FAST_BANDS; prio++)
		skb_queue_head_init(list + prio);
L
Linus Torvalds 已提交
394 395 396 397 398 399

	return 0;
}

static struct Qdisc_ops pfifo_fast_ops = {
	.id		=	"pfifo_fast",
400
	.priv_size	=	PFIFO_FAST_BANDS * sizeof(struct sk_buff_head),
L
Linus Torvalds 已提交
401 402 403 404 405 406 407 408 409
	.enqueue	=	pfifo_fast_enqueue,
	.dequeue	=	pfifo_fast_dequeue,
	.requeue	=	pfifo_fast_requeue,
	.init		=	pfifo_fast_init,
	.reset		=	pfifo_fast_reset,
	.dump		=	pfifo_fast_dump,
	.owner		=	THIS_MODULE,
};

410
struct Qdisc *qdisc_alloc(struct net_device *dev, struct Qdisc_ops *ops)
L
Linus Torvalds 已提交
411 412 413
{
	void *p;
	struct Qdisc *sch;
414 415
	unsigned int size;
	int err = -ENOBUFS;
L
Linus Torvalds 已提交
416 417

	/* ensure that the Qdisc and the private data are 32-byte aligned */
418 419
	size = QDISC_ALIGN(sizeof(*sch));
	size += ops->priv_size + (QDISC_ALIGNTO - 1);
L
Linus Torvalds 已提交
420

421
	p = kzalloc(size, GFP_KERNEL);
L
Linus Torvalds 已提交
422
	if (!p)
423 424 425
		goto errout;
	sch = (struct Qdisc *) QDISC_ALIGN((unsigned long) p);
	sch->padded = (char *) sch - (char *) p;
L
Linus Torvalds 已提交
426 427 428 429 430 431 432 433 434 435

	INIT_LIST_HEAD(&sch->list);
	skb_queue_head_init(&sch->q);
	sch->ops = ops;
	sch->enqueue = ops->enqueue;
	sch->dequeue = ops->dequeue;
	sch->dev = dev;
	dev_hold(dev);
	sch->stats_lock = &dev->queue_lock;
	atomic_set(&sch->refcnt, 1);
436 437 438 439 440 441

	return sch;
errout:
	return ERR_PTR(-err);
}

442 443
struct Qdisc * qdisc_create_dflt(struct net_device *dev, struct Qdisc_ops *ops,
				 unsigned int parentid)
444 445
{
	struct Qdisc *sch;
446

447 448 449
	sch = qdisc_alloc(dev, ops);
	if (IS_ERR(sch))
		goto errout;
450
	sch->parent = parentid;
451

L
Linus Torvalds 已提交
452 453 454
	if (!ops->init || ops->init(sch, NULL) == 0)
		return sch;

455
	qdisc_destroy(sch);
456
errout:
L
Linus Torvalds 已提交
457 458 459 460 461 462 463 464 465 466 467 468 469
	return NULL;
}

/* Under dev->queue_lock and BH! */

/* Invoke the qdisc's reset hook, if its ops table provides one. */
void qdisc_reset(struct Qdisc *qdisc)
{
	struct Qdisc_ops *hooks = qdisc->ops;

	if (!hooks->reset)
		return;

	hooks->reset(qdisc);
}

470
/* This is the RCU callback function to clean up a qdisc when there
 * are no further references to it. */

/*
 * Free the memory backing a qdisc.  The allocation begins 'padded' bytes
 * before the struct Qdisc itself, so step back by that amount first.
 */
static void __qdisc_destroy(struct rcu_head *head)
{
	struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu);
	char *base = (char *) qdisc - qdisc->padded;

	kfree(base);
}

/* Under dev->queue_lock and BH! */

void qdisc_destroy(struct Qdisc *qdisc)
{
483
	struct Qdisc_ops  *ops = qdisc->ops;
L
Linus Torvalds 已提交
484 485

	if (qdisc->flags & TCQ_F_BUILTIN ||
486
	    !atomic_dec_and_test(&qdisc->refcnt))
L
Linus Torvalds 已提交
487 488
		return;

489 490 491 492 493 494 495 496
	list_del(&qdisc->list);
#ifdef CONFIG_NET_ESTIMATOR
	gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
#endif
	if (ops->reset)
		ops->reset(qdisc);
	if (ops->destroy)
		ops->destroy(qdisc);
L
Linus Torvalds 已提交
497

498 499
	module_put(ops->owner);
	dev_put(qdisc->dev);
L
Linus Torvalds 已提交
500 501 502 503 504 505 506 507 508 509 510 511 512 513
	call_rcu(&qdisc->q_rcu, __qdisc_destroy);
}

void dev_activate(struct net_device *dev)
{
	/* No queueing discipline is attached to device;
	   create default one i.e. pfifo_fast for devices,
	   which need queueing and noqueue_qdisc for
	   virtual interfaces
	 */

	if (dev->qdisc_sleeping == &noop_qdisc) {
		struct Qdisc *qdisc;
		if (dev->tx_queue_len) {
514 515
			qdisc = qdisc_create_dflt(dev, &pfifo_fast_ops,
						  TC_H_ROOT);
L
Linus Torvalds 已提交
516 517 518 519 520 521 522 523 524 525 526
			if (qdisc == NULL) {
				printk(KERN_INFO "%s: activation failed\n", dev->name);
				return;
			}
			list_add_tail(&qdisc->list, &dev->qdisc_list);
		} else {
			qdisc =  &noqueue_qdisc;
		}
		dev->qdisc_sleeping = qdisc;
	}

527 528 529 530
	if (!netif_carrier_ok(dev))
		/* Delay activation until next carrier-on event */
		return;

L
Linus Torvalds 已提交
531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553
	spin_lock_bh(&dev->queue_lock);
	rcu_assign_pointer(dev->qdisc, dev->qdisc_sleeping);
	if (dev->qdisc != &noqueue_qdisc) {
		dev->trans_start = jiffies;
		dev_watchdog_up(dev);
	}
	spin_unlock_bh(&dev->queue_lock);
}

void dev_deactivate(struct net_device *dev)
{
	struct Qdisc *qdisc;

	spin_lock_bh(&dev->queue_lock);
	qdisc = dev->qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_reset(qdisc);

	spin_unlock_bh(&dev->queue_lock);

	dev_watchdog_down(dev);

554 555
	/* Wait for outstanding dev_queue_xmit calls. */
	synchronize_rcu();
L
Linus Torvalds 已提交
556

557 558 559
	/* Wait for outstanding qdisc_run calls. */
	while (test_bit(__LINK_STATE_QDISC_RUNNING, &dev->state))
		yield();
560 561 562 563 564

	if (dev->gso_skb) {
		kfree_skb(dev->gso_skb);
		dev->gso_skb = NULL;
	}
L
Linus Torvalds 已提交
565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587
}

/*
 * Put the device's scheduler state into its initial form: both the
 * active and the sleeping qdisc are the noop qdisc, the qdisc list is
 * empty, and the watchdog timer is initialised.
 */
void dev_init_scheduler(struct net_device *dev)
{
	qdisc_lock_tree(dev);

	INIT_LIST_HEAD(&dev->qdisc_list);
	dev->qdisc_sleeping = &noop_qdisc;
	dev->qdisc = &noop_qdisc;

	qdisc_unlock_tree(dev);

	dev_watchdog_init(dev);
}

/*
 * Destroy the device's qdiscs under the tree lock.
 *
 * Both qdisc pointers are reset to the noop qdisc before the sleeping
 * qdisc is destroyed; an ingress qdisc, when support is configured and
 * one is attached, is detached and destroyed as well.  The watchdog
 * timer is expected to be idle at this point.
 */
void dev_shutdown(struct net_device *dev)
{
	struct Qdisc *qdisc;

	qdisc_lock_tree(dev);

	qdisc = dev->qdisc_sleeping;
	dev->qdisc = &noop_qdisc;
	dev->qdisc_sleeping = &noop_qdisc;
	qdisc_destroy(qdisc);

#if defined(CONFIG_NET_SCH_INGRESS) || defined(CONFIG_NET_SCH_INGRESS_MODULE)
	qdisc = dev->qdisc_ingress;
	if (qdisc != NULL) {
		dev->qdisc_ingress = NULL;
		qdisc_destroy(qdisc);
	}
#endif

	BUG_TRAP(!timer_pending(&dev->watchdog_timer));

	qdisc_unlock_tree(dev);
}

597 598
/* Carrier state helpers. */
EXPORT_SYMBOL(netif_carrier_on);
EXPORT_SYMBOL(netif_carrier_off);

/* Qdisc objects and lifecycle helpers. */
EXPORT_SYMBOL(noop_qdisc);
EXPORT_SYMBOL(qdisc_create_dflt);
EXPORT_SYMBOL(qdisc_destroy);
EXPORT_SYMBOL(qdisc_reset);

/* Qdisc tree locking. */
EXPORT_SYMBOL(qdisc_lock_tree);
EXPORT_SYMBOL(qdisc_unlock_tree);