sch_teql.c 12.0 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/* net/sched/sch_teql.c	"True" (or "trivial") link equalizer.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
14
#include <linux/slab.h>
L
Linus Torvalds 已提交
15 16
#include <linux/string.h>
#include <linux/errno.h>
17
#include <linux/if_arp.h>
L
Linus Torvalds 已提交
18 19 20 21
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/moduleparam.h>
22 23
#include <net/dst.h>
#include <net/neighbour.h>
L
Linus Torvalds 已提交
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
#include <net/pkt_sched.h>

/*
   How to set it up.
   -----------------

   After loading this module you will find a new device teqlN
   and new qdisc with the same name. To join a slave to the equalizer
   you should just set this qdisc on a device f.e.

   # tc qdisc add dev eth0 root teql0
   # tc qdisc add dev eth1 root teql0

   That's all. Full PnP 8)

   Applicability.
   --------------

   1. Slave devices MUST be active devices, i.e., they must raise the tbusy
      signal and generate EOI events. If you want to equalize virtual devices
      like tunnels, use a normal eql device.
   2. This device puts no limitations on physical slave characteristics
      f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
      Certainly, a large difference in link speeds will make the resulting
      equalized link unusable, because of huge packet reordering.
      I estimate an upper useful difference as ~10 times.
   3. If the slave requires address resolution, only protocols using
      neighbour cache (IPv4/IPv6) will work over the equalized link.
      Other protocols are still allowed to use the slave device directly,
      which will not break load balancing, though native slave
      traffic will have the highest priority.  */

E
Eric Dumazet 已提交
56
struct teql_master {
L
Linus Torvalds 已提交
57 58 59 60
	struct Qdisc_ops qops;
	struct net_device *dev;
	struct Qdisc *slaves;
	struct list_head master_list;
E
Eric Dumazet 已提交
61 62 63 64
	unsigned long	tx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_errors;
	unsigned long	tx_dropped;
L
Linus Torvalds 已提交
65 66
};

E
Eric Dumazet 已提交
67
struct teql_sched_data {
L
Linus Torvalds 已提交
68 69 70 71 72
	struct Qdisc *next;
	struct teql_master *m;
	struct sk_buff_head q;
};

E
Eric Dumazet 已提交
73
/* Successor of qdisc @q in the circular list of slaves. */
#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)

/* Link-type flags that the master device copies from its slaves. */
#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
L
Linus Torvalds 已提交
76 77 78 79

/* "teql*" qdisc routines */

static int
E
Eric Dumazet 已提交
80
teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
L
Linus Torvalds 已提交
81
{
82
	struct net_device *dev = qdisc_dev(sch);
L
Linus Torvalds 已提交
83 84
	struct teql_sched_data *q = qdisc_priv(sch);

85 86
	if (q->q.qlen < dev->tx_queue_len) {
		__skb_queue_tail(&q->q, skb);
87
		return NET_XMIT_SUCCESS;
L
Linus Torvalds 已提交
88 89
	}

90
	return qdisc_drop(skb, sch);
L
Linus Torvalds 已提交
91 92 93
}

static struct sk_buff *
E
Eric Dumazet 已提交
94
teql_dequeue(struct Qdisc *sch)
L
Linus Torvalds 已提交
95 96
{
	struct teql_sched_data *dat = qdisc_priv(sch);
97
	struct netdev_queue *dat_queue;
L
Linus Torvalds 已提交
98 99 100
	struct sk_buff *skb;

	skb = __skb_dequeue(&dat->q);
101
	dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
L
Linus Torvalds 已提交
102
	if (skb == NULL) {
103
		struct net_device *m = qdisc_dev(dat_queue->qdisc);
L
Linus Torvalds 已提交
104 105 106 107
		if (m) {
			dat->m->slaves = sch;
			netif_wake_queue(m);
		}
108 109
	} else {
		qdisc_bstats_update(sch, skb);
L
Linus Torvalds 已提交
110
	}
111
	sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
L
Linus Torvalds 已提交
112 113 114
	return skb;
}

115
static struct sk_buff *
E
Eric Dumazet 已提交
116
teql_peek(struct Qdisc *sch)
117 118 119 120 121
{
	/* teql is meant to be used as root qdisc */
	return NULL;
}

E
Eric Dumazet 已提交
122
/* Release a neighbour reference; a NULL @n is silently ignored. */
static inline void
teql_neigh_release(struct neighbour *n)
{
	if (!n)
		return;

	neigh_release(n);
}

static void
E
Eric Dumazet 已提交
130
teql_reset(struct Qdisc *sch)
L
Linus Torvalds 已提交
131 132 133 134 135 136 137 138
{
	struct teql_sched_data *dat = qdisc_priv(sch);

	skb_queue_purge(&dat->q);
	sch->q.qlen = 0;
}

static void
E
Eric Dumazet 已提交
139
teql_destroy(struct Qdisc *sch)
L
Linus Torvalds 已提交
140 141 142 143 144
{
	struct Qdisc *q, *prev;
	struct teql_sched_data *dat = qdisc_priv(sch);
	struct teql_master *master = dat->m;

E
Eric Dumazet 已提交
145 146
	prev = master->slaves;
	if (prev) {
L
Linus Torvalds 已提交
147 148 149 150 151 152 153
		do {
			q = NEXT_SLAVE(prev);
			if (q == sch) {
				NEXT_SLAVE(prev) = NEXT_SLAVE(q);
				if (q == master->slaves) {
					master->slaves = NEXT_SLAVE(q);
					if (q == master->slaves) {
154
						struct netdev_queue *txq;
155
						spinlock_t *root_lock;
156 157

						txq = netdev_get_tx_queue(master->dev, 0);
L
Linus Torvalds 已提交
158
						master->slaves = NULL;
159

160
						root_lock = qdisc_root_sleeping_lock(txq->qdisc);
161
						spin_lock_bh(root_lock);
162
						qdisc_reset(txq->qdisc);
163
						spin_unlock_bh(root_lock);
L
Linus Torvalds 已提交
164 165 166 167 168
					}
				}
				skb_queue_purge(&dat->q);
				break;
			}
169

L
Linus Torvalds 已提交
170 171 172 173
		} while ((prev = q) != master->slaves);
	}
}

174
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
L
Linus Torvalds 已提交
175
{
176
	struct net_device *dev = qdisc_dev(sch);
E
Eric Dumazet 已提交
177
	struct teql_master *m = (struct teql_master *)sch->ops;
L
Linus Torvalds 已提交
178 179 180 181 182 183 184 185 186 187 188 189 190 191
	struct teql_sched_data *q = qdisc_priv(sch);

	if (dev->hard_header_len > m->dev->hard_header_len)
		return -EINVAL;

	if (m->dev == dev)
		return -ELOOP;

	q->m = m;

	skb_queue_head_init(&q->q);

	if (m->slaves) {
		if (m->dev->flags & IFF_UP) {
192 193 194 195 196 197 198
			if ((m->dev->flags & IFF_POINTOPOINT &&
			     !(dev->flags & IFF_POINTOPOINT)) ||
			    (m->dev->flags & IFF_BROADCAST &&
			     !(dev->flags & IFF_BROADCAST)) ||
			    (m->dev->flags & IFF_MULTICAST &&
			     !(dev->flags & IFF_MULTICAST)) ||
			    dev->mtu < m->dev->mtu)
L
Linus Torvalds 已提交
199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222
				return -EINVAL;
		} else {
			if (!(dev->flags&IFF_POINTOPOINT))
				m->dev->flags &= ~IFF_POINTOPOINT;
			if (!(dev->flags&IFF_BROADCAST))
				m->dev->flags &= ~IFF_BROADCAST;
			if (!(dev->flags&IFF_MULTICAST))
				m->dev->flags &= ~IFF_MULTICAST;
			if (dev->mtu < m->dev->mtu)
				m->dev->mtu = dev->mtu;
		}
		q->next = NEXT_SLAVE(m->slaves);
		NEXT_SLAVE(m->slaves) = sch;
	} else {
		q->next = sch;
		m->slaves = sch;
		m->dev->mtu = dev->mtu;
		m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
	}
	return 0;
}


static int
E
Eric Dumazet 已提交
223 224
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
	       struct net_device *dev, struct netdev_queue *txq,
225
	       struct dst_entry *dst)
L
Linus Torvalds 已提交
226
{
227 228
	struct neighbour *n;
	int err = 0;
L
Linus Torvalds 已提交
229

230 231 232 233 234 235 236 237 238 239 240 241
	n = dst_neigh_lookup_skb(dst, skb);
	if (!n)
		return -ENOENT;

	if (dst->dev != dev) {
		struct neighbour *mn;

		mn = __neigh_lookup_errno(n->tbl, n->primary_key, dev);
		neigh_release(n);
		if (IS_ERR(mn))
			return PTR_ERR(mn);
		n = mn;
L
Linus Torvalds 已提交
242
	}
243

L
Linus Torvalds 已提交
244 245
	if (neigh_event_send(n, skb_res) == 0) {
		int err;
246
		char haddr[MAX_ADDR_LEN];
247

248 249 250
		neigh_ha_snapshot(haddr, n, dev);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
				      NULL, skb->len);
251

252 253 254 255
		if (err < 0)
			err = -EINVAL;
	} else {
		err = (skb_res == NULL) ? -EAGAIN : 1;
L
Linus Torvalds 已提交
256 257
	}
	neigh_release(n);
258
	return err;
L
Linus Torvalds 已提交
259 260
}

261
static inline int teql_resolve(struct sk_buff *skb,
E
Eric Dumazet 已提交
262 263 264
			       struct sk_buff *skb_res,
			       struct net_device *dev,
			       struct netdev_queue *txq)
L
Linus Torvalds 已提交
265
{
E
Eric Dumazet 已提交
266 267 268
	struct dst_entry *dst = skb_dst(skb);
	int res;

269
	if (txq->qdisc == &noop_qdisc)
270 271
		return -ENODEV;

E
Eric Dumazet 已提交
272
	if (!dev->header_ops || !dst)
L
Linus Torvalds 已提交
273
		return 0;
E
Eric Dumazet 已提交
274 275

	rcu_read_lock();
276
	res = __teql_resolve(skb, skb_res, dev, txq, dst);
E
Eric Dumazet 已提交
277 278 279
	rcu_read_unlock();

	return res;
L
Linus Torvalds 已提交
280 281
}

282
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
283
{
284
	struct teql_master *master = netdev_priv(dev);
L
Linus Torvalds 已提交
285 286 287
	struct Qdisc *start, *q;
	int busy;
	int nores;
288
	int subq = skb_get_queue_mapping(skb);
L
Linus Torvalds 已提交
289 290 291 292 293 294 295 296
	struct sk_buff *skb_res = NULL;

	start = master->slaves;

restart:
	nores = 0;
	busy = 0;

E
Eric Dumazet 已提交
297 298
	q = start;
	if (!q)
L
Linus Torvalds 已提交
299 300 301
		goto drop;

	do {
302
		struct net_device *slave = qdisc_dev(q);
303
		struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
304

305
		if (slave_txq->qdisc_sleeping != q)
L
Linus Torvalds 已提交
306
			continue;
307
		if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
308
		    !netif_running(slave)) {
L
Linus Torvalds 已提交
309 310 311 312
			busy = 1;
			continue;
		}

E
Eric Dumazet 已提交
313
		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
L
Linus Torvalds 已提交
314
		case 0:
315
			if (__netif_tx_trylock(slave_txq)) {
316 317
				unsigned int length = qdisc_pkt_len(skb);

318
				if (!netif_xmit_frozen_or_stopped(slave_txq) &&
D
David S. Miller 已提交
319
				    netdev_start_xmit(skb, slave) == NETDEV_TX_OK) {
E
Eric Dumazet 已提交
320
					txq_trans_update(slave_txq);
321
					__netif_tx_unlock(slave_txq);
L
Linus Torvalds 已提交
322 323
					master->slaves = NEXT_SLAVE(q);
					netif_wake_queue(dev);
E
Eric Dumazet 已提交
324 325
					master->tx_packets++;
					master->tx_bytes += length;
326
					return NETDEV_TX_OK;
L
Linus Torvalds 已提交
327
				}
328
				__netif_tx_unlock(slave_txq);
L
Linus Torvalds 已提交
329
			}
330
			if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
L
Linus Torvalds 已提交
331 332 333 334
				busy = 1;
			break;
		case 1:
			master->slaves = NEXT_SLAVE(q);
335
			return NETDEV_TX_OK;
L
Linus Torvalds 已提交
336 337 338 339
		default:
			nores = 1;
			break;
		}
340
		__skb_pull(skb, skb_network_offset(skb));
L
Linus Torvalds 已提交
341 342 343 344 345 346 347 348 349
	} while ((q = NEXT_SLAVE(q)) != start);

	if (nores && skb_res == NULL) {
		skb_res = skb;
		goto restart;
	}

	if (busy) {
		netif_stop_queue(dev);
350
		return NETDEV_TX_BUSY;
L
Linus Torvalds 已提交
351
	}
E
Eric Dumazet 已提交
352
	master->tx_errors++;
L
Linus Torvalds 已提交
353 354

drop:
E
Eric Dumazet 已提交
355
	master->tx_dropped++;
L
Linus Torvalds 已提交
356
	dev_kfree_skb(skb);
357
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
358 359 360 361
}

static int teql_master_open(struct net_device *dev)
{
E
Eric Dumazet 已提交
362
	struct Qdisc *q;
363
	struct teql_master *m = netdev_priv(dev);
L
Linus Torvalds 已提交
364
	int mtu = 0xFFFE;
E
Eric Dumazet 已提交
365
	unsigned int flags = IFF_NOARP | IFF_MULTICAST;
L
Linus Torvalds 已提交
366 367 368 369 370 371 372 373

	if (m->slaves == NULL)
		return -EUNATCH;

	flags = FMASK;

	q = m->slaves;
	do {
374
		struct net_device *slave = qdisc_dev(q);
L
Linus Torvalds 已提交
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407

		if (slave == NULL)
			return -EUNATCH;

		if (slave->mtu < mtu)
			mtu = slave->mtu;
		if (slave->hard_header_len > LL_MAX_HEADER)
			return -EINVAL;

		/* If all the slaves are BROADCAST, master is BROADCAST
		   If all the slaves are PtP, master is PtP
		   Otherwise, master is NBMA.
		 */
		if (!(slave->flags&IFF_POINTOPOINT))
			flags &= ~IFF_POINTOPOINT;
		if (!(slave->flags&IFF_BROADCAST))
			flags &= ~IFF_BROADCAST;
		if (!(slave->flags&IFF_MULTICAST))
			flags &= ~IFF_MULTICAST;
	} while ((q = NEXT_SLAVE(q)) != m->slaves);

	m->dev->mtu = mtu;
	m->dev->flags = (m->dev->flags&~FMASK) | flags;
	netif_start_queue(m->dev);
	return 0;
}

/* Take the master device down: stop its transmit queue. Always returns 0. */
static int teql_master_close(struct net_device *dev)
{
	netif_stop_queue(dev);

	return 0;
}

E
Eric Dumazet 已提交
408 409 410 411 412 413 414 415 416 417 418 419
static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
						     struct rtnl_link_stats64 *stats)
{
	struct teql_master *m = netdev_priv(dev);

	stats->tx_packets	= m->tx_packets;
	stats->tx_bytes		= m->tx_bytes;
	stats->tx_errors	= m->tx_errors;
	stats->tx_dropped	= m->tx_dropped;
	return stats;
}

L
Linus Torvalds 已提交
420 421
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
422
	struct teql_master *m = netdev_priv(dev);
L
Linus Torvalds 已提交
423 424 425 426 427 428 429 430
	struct Qdisc *q;

	if (new_mtu < 68)
		return -EINVAL;

	q = m->slaves;
	if (q) {
		do {
431
			if (new_mtu > qdisc_dev(q)->mtu)
L
Linus Torvalds 已提交
432
				return -EINVAL;
E
Eric Dumazet 已提交
433
		} while ((q = NEXT_SLAVE(q)) != m->slaves);
L
Linus Torvalds 已提交
434 435 436 437 438 439
	}

	dev->mtu = new_mtu;
	return 0;
}

440 441 442 443
static const struct net_device_ops teql_netdev_ops = {
	.ndo_open	= teql_master_open,
	.ndo_stop	= teql_master_close,
	.ndo_start_xmit	= teql_master_xmit,
E
Eric Dumazet 已提交
444
	.ndo_get_stats64 = teql_master_stats64,
445 446 447
	.ndo_change_mtu	= teql_master_mtu,
};

L
Linus Torvalds 已提交
448 449
static __init void teql_master_setup(struct net_device *dev)
{
450
	struct teql_master *master = netdev_priv(dev);
L
Linus Torvalds 已提交
451 452 453 454
	struct Qdisc_ops *ops = &master->qops;

	master->dev	= dev;
	ops->priv_size  = sizeof(struct teql_sched_data);
455

L
Linus Torvalds 已提交
456 457
	ops->enqueue	=	teql_enqueue;
	ops->dequeue	=	teql_dequeue;
458
	ops->peek	=	teql_peek;
L
Linus Torvalds 已提交
459 460 461 462 463
	ops->init	=	teql_qdisc_init;
	ops->reset	=	teql_reset;
	ops->destroy	=	teql_destroy;
	ops->owner	=	THIS_MODULE;

464
	dev->netdev_ops =       &teql_netdev_ops;
L
Linus Torvalds 已提交
465 466 467 468 469
	dev->type		= ARPHRD_VOID;
	dev->mtu		= 1500;
	dev->tx_queue_len	= 100;
	dev->flags		= IFF_NOARP;
	dev->hard_header_len	= LL_MAX_HEADER;
470
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
L
Linus Torvalds 已提交
471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486
}

/* Every successfully registered master, linked through master_list. */
static LIST_HEAD(master_dev_list);
/* Upper bound on the number of teqlN devices teql_init() tries to create. */
static int max_equalizers = 1;
module_param(max_equalizers, int, 0);
MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");

/*
 * Module init: create up to max_equalizers master devices.
 *
 * For each one a net_device named "teql%d" is allocated and registered,
 * then a qdisc with the same name is registered and the master is added to
 * master_dev_list.  The loop stops at the first failure, undoing the
 * partially created device.
 *
 * Returns 0 if the loop advanced past the first equalizer, otherwise the
 * error of the first attempt (-ENODEV when max_equalizers is not positive).
 */
static int __init teql_init(void)
{
	int err = -ENODEV;
	int i;

	for (i = 0; i < max_equalizers; i++) {
		struct teql_master *master;
		struct net_device *dev;

		dev = alloc_netdev(sizeof(struct teql_master), "teql%d",
				   NET_NAME_UNKNOWN, teql_master_setup);
		if (!dev) {
			err = -ENOMEM;
			break;
		}

		err = register_netdev(dev);
		if (err) {
			free_netdev(dev);
			break;
		}

		master = netdev_priv(dev);

		/* The qdisc is exposed under the device's final name. */
		strlcpy(master->qops.id, dev->name, IFNAMSIZ);
		err = register_qdisc(&master->qops);
		if (err) {
			unregister_netdev(dev);
			free_netdev(dev);
			break;
		}

		list_add_tail(&master->master_list, &master_dev_list);
	}

	return i ? 0 : err;
}

515
/*
 * Module exit: tear down every master on master_dev_list, unregistering
 * its qdisc first and then unregistering and freeing its net_device.
 */
static void __exit teql_exit(void)
{
	struct teql_master *cur, *tmp;

	list_for_each_entry_safe(cur, tmp, &master_dev_list, master_list) {
		list_del(&cur->master_list);

		unregister_qdisc(&cur->qops);
		unregister_netdev(cur->dev);
		free_netdev(cur->dev);
	}
}

/* Module load/unload entry points and license tag. */
module_init(teql_init);
module_exit(teql_exit);

MODULE_LICENSE("GPL");