sch_teql.c 12.4 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
/* net/sched/sch_teql.c	"True" (or "trivial") link equalizer.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/types.h>
#include <linux/kernel.h>
14
#include <linux/slab.h>
L
Linus Torvalds 已提交
15 16
#include <linux/string.h>
#include <linux/errno.h>
17
#include <linux/if_arp.h>
L
Linus Torvalds 已提交
18 19 20 21
#include <linux/netdevice.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/moduleparam.h>
22 23
#include <net/dst.h>
#include <net/neighbour.h>
L
Linus Torvalds 已提交
24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
#include <net/pkt_sched.h>

/*
   How to setup it.
   ----------------

   After loading this module you will find a new device teqlN
   and new qdisc with the same name. To join a slave to the equalizer
   you should just set this qdisc on a device f.e.

   # tc qdisc add dev eth0 root teql0
   # tc qdisc add dev eth1 root teql0

   That's all. Full PnP 8)

   Applicability.
   --------------

   1. Slave devices MUST be active devices, i.e., they must raise the tbusy
      signal and generate EOI events. If you want to equalize virtual devices
      like tunnels, use a normal eql device.
   2. This device puts no limitations on physical slave characteristics
      f.e. it will equalize 9600baud line and 100Mb ethernet perfectly :-)
      Certainly, large difference in link speeds will make the resulting
      equalized link unusable, because of huge packet reordering.
      I estimate an upper useful difference as ~10 times.
   3. If the slave requires address resolution, only protocols using
      neighbour cache (IPv4/IPv6) will work over the equalized link.
      Other protocols are still allowed to use the slave device directly,
      which will not break load balancing, though native slave
      traffic will have the highest priority.  */

E
Eric Dumazet 已提交
56
struct teql_master {
L
Linus Torvalds 已提交
57 58 59 60
	struct Qdisc_ops qops;
	struct net_device *dev;
	struct Qdisc *slaves;
	struct list_head master_list;
E
Eric Dumazet 已提交
61 62 63 64
	unsigned long	tx_bytes;
	unsigned long	tx_packets;
	unsigned long	tx_errors;
	unsigned long	tx_dropped;
L
Linus Torvalds 已提交
65 66
};

E
Eric Dumazet 已提交
67
struct teql_sched_data {
L
Linus Torvalds 已提交
68 69 70 71 72 73
	struct Qdisc *next;
	struct teql_master *m;
	struct neighbour *ncache;
	struct sk_buff_head q;
};

E
Eric Dumazet 已提交
74
/* The `next` field of qdisc @q's teql private data, i.e. its successor in
 * the master's circular slave list. Usable as an lvalue.
 */
#define NEXT_SLAVE(q) (((struct teql_sched_data *)qdisc_priv(q))->next)
L
Linus Torvalds 已提交
75

E
Eric Dumazet 已提交
76
/* Link-type flags that the master device takes over from its slaves. */
#define FMASK (IFF_BROADCAST | IFF_POINTOPOINT)
L
Linus Torvalds 已提交
77 78 79 80

/* "teql*" qdisc routines */

static int
E
Eric Dumazet 已提交
81
teql_enqueue(struct sk_buff *skb, struct Qdisc *sch)
L
Linus Torvalds 已提交
82
{
83
	struct net_device *dev = qdisc_dev(sch);
L
Linus Torvalds 已提交
84 85
	struct teql_sched_data *q = qdisc_priv(sch);

86 87
	if (q->q.qlen < dev->tx_queue_len) {
		__skb_queue_tail(&q->q, skb);
88
		return NET_XMIT_SUCCESS;
L
Linus Torvalds 已提交
89 90
	}

91
	return qdisc_drop(skb, sch);
L
Linus Torvalds 已提交
92 93 94
}

static struct sk_buff *
E
Eric Dumazet 已提交
95
teql_dequeue(struct Qdisc *sch)
L
Linus Torvalds 已提交
96 97
{
	struct teql_sched_data *dat = qdisc_priv(sch);
98
	struct netdev_queue *dat_queue;
L
Linus Torvalds 已提交
99 100 101
	struct sk_buff *skb;

	skb = __skb_dequeue(&dat->q);
102
	dat_queue = netdev_get_tx_queue(dat->m->dev, 0);
L
Linus Torvalds 已提交
103
	if (skb == NULL) {
104
		struct net_device *m = qdisc_dev(dat_queue->qdisc);
L
Linus Torvalds 已提交
105 106 107 108
		if (m) {
			dat->m->slaves = sch;
			netif_wake_queue(m);
		}
109 110
	} else {
		qdisc_bstats_update(sch, skb);
L
Linus Torvalds 已提交
111
	}
112
	sch->q.qlen = dat->q.qlen + dat_queue->qdisc->q.qlen;
L
Linus Torvalds 已提交
113 114 115
	return skb;
}

116
static struct sk_buff *
E
Eric Dumazet 已提交
117
teql_peek(struct Qdisc *sch)
118 119 120 121 122
{
	/* teql is meant to be used as root qdisc */
	return NULL;
}

E
Eric Dumazet 已提交
123
/* neigh_release() wrapper that tolerates a NULL neighbour. */
static inline void
teql_neigh_release(struct neighbour *n)
{
	if (!n)
		return;

	neigh_release(n);
}

static void
E
Eric Dumazet 已提交
131
teql_reset(struct Qdisc *sch)
L
Linus Torvalds 已提交
132 133 134 135 136 137 138 139 140
{
	struct teql_sched_data *dat = qdisc_priv(sch);

	skb_queue_purge(&dat->q);
	sch->q.qlen = 0;
	teql_neigh_release(xchg(&dat->ncache, NULL));
}

static void
E
Eric Dumazet 已提交
141
teql_destroy(struct Qdisc *sch)
L
Linus Torvalds 已提交
142 143 144 145 146
{
	struct Qdisc *q, *prev;
	struct teql_sched_data *dat = qdisc_priv(sch);
	struct teql_master *master = dat->m;

E
Eric Dumazet 已提交
147 148
	prev = master->slaves;
	if (prev) {
L
Linus Torvalds 已提交
149 150 151 152 153 154 155
		do {
			q = NEXT_SLAVE(prev);
			if (q == sch) {
				NEXT_SLAVE(prev) = NEXT_SLAVE(q);
				if (q == master->slaves) {
					master->slaves = NEXT_SLAVE(q);
					if (q == master->slaves) {
156
						struct netdev_queue *txq;
157
						spinlock_t *root_lock;
158 159

						txq = netdev_get_tx_queue(master->dev, 0);
L
Linus Torvalds 已提交
160
						master->slaves = NULL;
161

162
						root_lock = qdisc_root_sleeping_lock(txq->qdisc);
163
						spin_lock_bh(root_lock);
164
						qdisc_reset(txq->qdisc);
165
						spin_unlock_bh(root_lock);
L
Linus Torvalds 已提交
166 167 168 169 170 171
					}
				}
				skb_queue_purge(&dat->q);
				teql_neigh_release(xchg(&dat->ncache, NULL));
				break;
			}
172

L
Linus Torvalds 已提交
173 174 175 176
		} while ((prev = q) != master->slaves);
	}
}

177
static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
L
Linus Torvalds 已提交
178
{
179
	struct net_device *dev = qdisc_dev(sch);
E
Eric Dumazet 已提交
180
	struct teql_master *m = (struct teql_master *)sch->ops;
L
Linus Torvalds 已提交
181 182 183 184 185 186 187 188 189 190 191 192 193 194
	struct teql_sched_data *q = qdisc_priv(sch);

	if (dev->hard_header_len > m->dev->hard_header_len)
		return -EINVAL;

	if (m->dev == dev)
		return -ELOOP;

	q->m = m;

	skb_queue_head_init(&q->q);

	if (m->slaves) {
		if (m->dev->flags & IFF_UP) {
195 196 197 198 199 200 201
			if ((m->dev->flags & IFF_POINTOPOINT &&
			     !(dev->flags & IFF_POINTOPOINT)) ||
			    (m->dev->flags & IFF_BROADCAST &&
			     !(dev->flags & IFF_BROADCAST)) ||
			    (m->dev->flags & IFF_MULTICAST &&
			     !(dev->flags & IFF_MULTICAST)) ||
			    dev->mtu < m->dev->mtu)
L
Linus Torvalds 已提交
202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225
				return -EINVAL;
		} else {
			if (!(dev->flags&IFF_POINTOPOINT))
				m->dev->flags &= ~IFF_POINTOPOINT;
			if (!(dev->flags&IFF_BROADCAST))
				m->dev->flags &= ~IFF_BROADCAST;
			if (!(dev->flags&IFF_MULTICAST))
				m->dev->flags &= ~IFF_MULTICAST;
			if (dev->mtu < m->dev->mtu)
				m->dev->mtu = dev->mtu;
		}
		q->next = NEXT_SLAVE(m->slaves);
		NEXT_SLAVE(m->slaves) = sch;
	} else {
		q->next = sch;
		m->slaves = sch;
		m->dev->mtu = dev->mtu;
		m->dev->flags = (m->dev->flags&~FMASK)|(dev->flags&FMASK);
	}
	return 0;
}


static int
E
Eric Dumazet 已提交
226 227 228
__teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res,
	       struct net_device *dev, struct netdev_queue *txq,
	       struct neighbour *mn)
L
Linus Torvalds 已提交
229
{
E
Eric Dumazet 已提交
230
	struct teql_sched_data *q = qdisc_priv(txq->qdisc);
L
Linus Torvalds 已提交
231 232 233 234 235 236 237 238 239 240 241 242 243 244
	struct neighbour *n = q->ncache;

	if (mn->tbl == NULL)
		return -EINVAL;
	if (n && n->tbl == mn->tbl &&
	    memcmp(n->primary_key, mn->primary_key, mn->tbl->key_len) == 0) {
		atomic_inc(&n->refcnt);
	} else {
		n = __neigh_lookup_errno(mn->tbl, mn->primary_key, dev);
		if (IS_ERR(n))
			return PTR_ERR(n);
	}
	if (neigh_event_send(n, skb_res) == 0) {
		int err;
245
		char haddr[MAX_ADDR_LEN];
246

247 248 249
		neigh_ha_snapshot(haddr, n, dev);
		err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr,
				      NULL, skb->len);
250

L
Linus Torvalds 已提交
251 252 253 254 255 256 257 258 259 260 261
		if (err < 0) {
			neigh_release(n);
			return -EINVAL;
		}
		teql_neigh_release(xchg(&q->ncache, n));
		return 0;
	}
	neigh_release(n);
	return (skb_res == NULL) ? -EAGAIN : 1;
}

262
static inline int teql_resolve(struct sk_buff *skb,
E
Eric Dumazet 已提交
263 264 265
			       struct sk_buff *skb_res,
			       struct net_device *dev,
			       struct netdev_queue *txq)
L
Linus Torvalds 已提交
266
{
E
Eric Dumazet 已提交
267 268 269 270
	struct dst_entry *dst = skb_dst(skb);
	struct neighbour *mn;
	int res;

271
	if (txq->qdisc == &noop_qdisc)
272 273
		return -ENODEV;

E
Eric Dumazet 已提交
274
	if (!dev->header_ops || !dst)
L
Linus Torvalds 已提交
275
		return 0;
E
Eric Dumazet 已提交
276 277

	rcu_read_lock();
278
	mn = dst_get_neighbour_noref(dst);
E
Eric Dumazet 已提交
279 280 281 282
	res = mn ? __teql_resolve(skb, skb_res, dev, txq, mn) : 0;
	rcu_read_unlock();

	return res;
L
Linus Torvalds 已提交
283 284
}

285
static netdev_tx_t teql_master_xmit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
286
{
287
	struct teql_master *master = netdev_priv(dev);
L
Linus Torvalds 已提交
288 289 290
	struct Qdisc *start, *q;
	int busy;
	int nores;
291
	int subq = skb_get_queue_mapping(skb);
L
Linus Torvalds 已提交
292 293 294 295 296 297 298 299
	struct sk_buff *skb_res = NULL;

	start = master->slaves;

restart:
	nores = 0;
	busy = 0;

E
Eric Dumazet 已提交
300 301
	q = start;
	if (!q)
L
Linus Torvalds 已提交
302 303 304
		goto drop;

	do {
305
		struct net_device *slave = qdisc_dev(q);
306 307
		struct netdev_queue *slave_txq = netdev_get_tx_queue(slave, 0);
		const struct net_device_ops *slave_ops = slave->netdev_ops;
308

309
		if (slave_txq->qdisc_sleeping != q)
L
Linus Torvalds 已提交
310
			continue;
311
		if (netif_xmit_stopped(netdev_get_tx_queue(slave, subq)) ||
312
		    !netif_running(slave)) {
L
Linus Torvalds 已提交
313 314 315 316
			busy = 1;
			continue;
		}

E
Eric Dumazet 已提交
317
		switch (teql_resolve(skb, skb_res, slave, slave_txq)) {
L
Linus Torvalds 已提交
318
		case 0:
319
			if (__netif_tx_trylock(slave_txq)) {
320 321
				unsigned int length = qdisc_pkt_len(skb);

322
				if (!netif_xmit_frozen_or_stopped(slave_txq) &&
323
				    slave_ops->ndo_start_xmit(skb, slave) == NETDEV_TX_OK) {
E
Eric Dumazet 已提交
324
					txq_trans_update(slave_txq);
325
					__netif_tx_unlock(slave_txq);
L
Linus Torvalds 已提交
326 327
					master->slaves = NEXT_SLAVE(q);
					netif_wake_queue(dev);
E
Eric Dumazet 已提交
328 329
					master->tx_packets++;
					master->tx_bytes += length;
330
					return NETDEV_TX_OK;
L
Linus Torvalds 已提交
331
				}
332
				__netif_tx_unlock(slave_txq);
L
Linus Torvalds 已提交
333
			}
334
			if (netif_xmit_stopped(netdev_get_tx_queue(dev, 0)))
L
Linus Torvalds 已提交
335 336 337 338
				busy = 1;
			break;
		case 1:
			master->slaves = NEXT_SLAVE(q);
339
			return NETDEV_TX_OK;
L
Linus Torvalds 已提交
340 341 342 343
		default:
			nores = 1;
			break;
		}
344
		__skb_pull(skb, skb_network_offset(skb));
L
Linus Torvalds 已提交
345 346 347 348 349 350 351 352 353
	} while ((q = NEXT_SLAVE(q)) != start);

	if (nores && skb_res == NULL) {
		skb_res = skb;
		goto restart;
	}

	if (busy) {
		netif_stop_queue(dev);
354
		return NETDEV_TX_BUSY;
L
Linus Torvalds 已提交
355
	}
E
Eric Dumazet 已提交
356
	master->tx_errors++;
L
Linus Torvalds 已提交
357 358

drop:
E
Eric Dumazet 已提交
359
	master->tx_dropped++;
L
Linus Torvalds 已提交
360
	dev_kfree_skb(skb);
361
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
362 363 364 365
}

static int teql_master_open(struct net_device *dev)
{
E
Eric Dumazet 已提交
366
	struct Qdisc *q;
367
	struct teql_master *m = netdev_priv(dev);
L
Linus Torvalds 已提交
368
	int mtu = 0xFFFE;
E
Eric Dumazet 已提交
369
	unsigned int flags = IFF_NOARP | IFF_MULTICAST;
L
Linus Torvalds 已提交
370 371 372 373 374 375 376 377

	if (m->slaves == NULL)
		return -EUNATCH;

	flags = FMASK;

	q = m->slaves;
	do {
378
		struct net_device *slave = qdisc_dev(q);
L
Linus Torvalds 已提交
379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411

		if (slave == NULL)
			return -EUNATCH;

		if (slave->mtu < mtu)
			mtu = slave->mtu;
		if (slave->hard_header_len > LL_MAX_HEADER)
			return -EINVAL;

		/* If all the slaves are BROADCAST, master is BROADCAST
		   If all the slaves are PtP, master is PtP
		   Otherwise, master is NBMA.
		 */
		if (!(slave->flags&IFF_POINTOPOINT))
			flags &= ~IFF_POINTOPOINT;
		if (!(slave->flags&IFF_BROADCAST))
			flags &= ~IFF_BROADCAST;
		if (!(slave->flags&IFF_MULTICAST))
			flags &= ~IFF_MULTICAST;
	} while ((q = NEXT_SLAVE(q)) != m->slaves);

	m->dev->mtu = mtu;
	m->dev->flags = (m->dev->flags&~FMASK) | flags;
	netif_start_queue(m->dev);
	return 0;
}

/* ndo_stop of the master device: stop its transmit queue. Always returns 0. */
static int teql_master_close(struct net_device *dev)
{
	netif_stop_queue(dev);

	return 0;
}

E
Eric Dumazet 已提交
412 413 414 415 416 417 418 419 420 421 422 423
static struct rtnl_link_stats64 *teql_master_stats64(struct net_device *dev,
						     struct rtnl_link_stats64 *stats)
{
	struct teql_master *m = netdev_priv(dev);

	stats->tx_packets	= m->tx_packets;
	stats->tx_bytes		= m->tx_bytes;
	stats->tx_errors	= m->tx_errors;
	stats->tx_dropped	= m->tx_dropped;
	return stats;
}

L
Linus Torvalds 已提交
424 425
static int teql_master_mtu(struct net_device *dev, int new_mtu)
{
426
	struct teql_master *m = netdev_priv(dev);
L
Linus Torvalds 已提交
427 428 429 430 431 432 433 434
	struct Qdisc *q;

	if (new_mtu < 68)
		return -EINVAL;

	q = m->slaves;
	if (q) {
		do {
435
			if (new_mtu > qdisc_dev(q)->mtu)
L
Linus Torvalds 已提交
436
				return -EINVAL;
E
Eric Dumazet 已提交
437
		} while ((q = NEXT_SLAVE(q)) != m->slaves);
L
Linus Torvalds 已提交
438 439 440 441 442 443
	}

	dev->mtu = new_mtu;
	return 0;
}

444 445 446 447
static const struct net_device_ops teql_netdev_ops = {
	.ndo_open	= teql_master_open,
	.ndo_stop	= teql_master_close,
	.ndo_start_xmit	= teql_master_xmit,
E
Eric Dumazet 已提交
448
	.ndo_get_stats64 = teql_master_stats64,
449 450 451
	.ndo_change_mtu	= teql_master_mtu,
};

L
Linus Torvalds 已提交
452 453
static __init void teql_master_setup(struct net_device *dev)
{
454
	struct teql_master *master = netdev_priv(dev);
L
Linus Torvalds 已提交
455 456 457 458
	struct Qdisc_ops *ops = &master->qops;

	master->dev	= dev;
	ops->priv_size  = sizeof(struct teql_sched_data);
459

L
Linus Torvalds 已提交
460 461
	ops->enqueue	=	teql_enqueue;
	ops->dequeue	=	teql_dequeue;
462
	ops->peek	=	teql_peek;
L
Linus Torvalds 已提交
463 464 465 466 467
	ops->init	=	teql_qdisc_init;
	ops->reset	=	teql_reset;
	ops->destroy	=	teql_destroy;
	ops->owner	=	THIS_MODULE;

468
	dev->netdev_ops =       &teql_netdev_ops;
L
Linus Torvalds 已提交
469 470 471 472 473
	dev->type		= ARPHRD_VOID;
	dev->mtu		= 1500;
	dev->tx_queue_len	= 100;
	dev->flags		= IFF_NOARP;
	dev->hard_header_len	= LL_MAX_HEADER;
474
	dev->priv_flags		&= ~IFF_XMIT_DST_RELEASE;
L
Linus Torvalds 已提交
475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502
}

/* All equalizers created by teql_init(), linked through
 * teql_master.master_list.
 */
static LIST_HEAD(master_dev_list);
/* Number of teqlN devices teql_init() tries to create (module parameter). */
static int max_equalizers = 1;
module_param(max_equalizers, int, 0);
MODULE_PARM_DESC(max_equalizers, "Max number of link equalizers");

/* Module init: create up to max_equalizers teqlN devices, registering for
 * each one a net_device and a qdisc named after it.
 *
 * Creation stops at the first failure. Returns 0 if at least one equalizer
 * was set up, otherwise the error of the failed step (-ENODEV when the loop
 * never ran).
 */
static int __init teql_init(void)
{
	int err = -ENODEV;
	int i;

	for (i = 0; i < max_equalizers; i++) {
		struct teql_master *master;
		struct net_device *dev;

		dev = alloc_netdev(sizeof(struct teql_master),
				   "teql%d", teql_master_setup);
		if (!dev) {
			err = -ENOMEM;
			break;
		}

		err = register_netdev(dev);
		if (err) {
			free_netdev(dev);
			break;
		}

		master = netdev_priv(dev);

		/* The qdisc id is the device name assigned at registration. */
		strlcpy(master->qops.id, dev->name, IFNAMSIZ);
		err = register_qdisc(&master->qops);
		if (err) {
			unregister_netdev(dev);
			free_netdev(dev);
			break;
		}

		list_add_tail(&master->master_list, &master_dev_list);
	}

	if (i)
		return 0;
	return err;
}

519
/* Module exit: tear down every equalizer on master_dev_list, unregistering
 * its qdisc and net_device and freeing the device.
 */
static void __exit teql_exit(void)
{
	struct teql_master *cur, *tmp;

	list_for_each_entry_safe(cur, tmp, &master_dev_list, master_list) {
		struct net_device *dev = cur->dev;

		list_del(&cur->master_list);

		unregister_qdisc(&cur->qops);
		unregister_netdev(dev);
		free_netdev(dev);
	}
}

/* Module entry and exit points. */
module_init(teql_init);
module_exit(teql_exit);

MODULE_LICENSE("GPL");