/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/uaccess.h>
#include <linux/types.h>
31
#include <linux/capability.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
48
#include <linux/if_ether.h>
49
#include <linux/slab.h>
50
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
51 52 53
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
54
#include <net/route.h>
L
Linus Torvalds 已提交
55 56 57 58 59 60 61
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
62
#include <linux/compat.h>
63
#include <linux/export.h>
64
#include <net/ip_tunnels.h>
L
Linus Torvalds 已提交
65
#include <net/checksum.h>
66
#include <net/netlink.h>
67
#include <net/fib_rules.h>
68
#include <linux/netconf.h>
69
#include <net/nexthop.h>
L
Linus Torvalds 已提交
70

71 72 73 74 75 76 77 78
/* Multicast-routing rule: wraps the generic fib_rule and adds no fields of
 * its own. Its size is what ipmr_rules_ops_template reports as .rule_size.
 */
struct ipmr_rule {
	struct fib_rule		common;
};

/* Output of a rule lookup: the table selected by ipmr_rule_action(). */
struct ipmr_result {
	struct mr_table		*mrt;
};

L
Linus Torvalds 已提交
79
/* Big lock, protecting vif table, mrt cache and mroute socket state.
 * Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

85
/* Multicast router control variables */
L
Linus Torvalds 已提交
86 87 88 89 90

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
 * entries is changed only in process context and protected
 * with the weak lock mrt_lock. The queue of unresolved entries is
 * protected with the strong spinlock mfc_unres_lock.
 *
 * In this case the data path is free of exclusive locks altogether.
 */

98
/* Slab cache that struct mfc_cache entries are allocated from and freed to. */
static struct kmem_cache *mrt_cachep __read_mostly;
L
Linus Torvalds 已提交
99

100
static struct mr_table *ipmr_new_table(struct net *net, u32 id);
101 102
static void ipmr_free_table(struct mr_table *mrt);

103 104 105
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
			  struct sk_buff *skb, struct mfc_cache *cache,
			  int local);
106
static int ipmr_cache_report(struct mr_table *mrt,
107
			     struct sk_buff *pkt, vifi_t vifi, int assert);
108 109
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			      struct mfc_cache *c, struct rtmsg *rtm);
110 111
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
				 int cmd);
112
static void mroute_clean_tables(struct mr_table *mrt, bool all);
113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129
static void ipmr_expire_process(unsigned long arg);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
/* Walk every multicast routing table linked on net->ipv4.mr_tables, using
 * the RCU list iterator.
 */
#define ipmr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

/* Find the table of @net whose id equals @id; NULL when there is none. */
static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	struct mr_table *tbl;

	ipmr_for_each_table(tbl, net) {
		if (tbl->id != id)
			continue;
		return tbl;
	}

	return NULL;
}

D
David S. Miller 已提交
130
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
131 132 133
			   struct mr_table **mrt)
{
	int err;
134 135 136 137 138
	struct ipmr_result res;
	struct fib_lookup_arg arg = {
		.result = &res,
		.flags = FIB_LOOKUP_NOREF,
	};
139

D
David S. Miller 已提交
140 141
	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
			       flowi4_to_flowi(flp4), 0, &arg);
142 143 144 145 146 147 148 149 150 151 152
	if (err < 0)
		return err;
	*mrt = res.mrt;
	return 0;
}

static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
			    int flags, struct fib_lookup_arg *arg)
{
	struct ipmr_result *res = arg->result;
	struct mr_table *mrt;
L
Linus Torvalds 已提交
153

154 155 156 157 158 159 160 161 162 163 164 165 166
	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	mrt = ipmr_get_table(rule->fr_net, rule->table);
167
	if (!mrt)
168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

/* fib_rules ->match hook. Ignores all of its arguments and always returns 1
 * (presumably "matches" - confirm against the fib_rules core).
 */
static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
	return 1;
}

/* Netlink attribute policy for ipmr rules: only the generic FRA_* policy
 * entries are listed, nothing ipmr-specific.
 */
static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
	FRA_GENERIC_POLICY,
};

/* fib_rules ->configure hook. Reads nothing from the request and always
 * returns 0.
 */
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct fib_rule_hdr *frh, struct nlattr **tb)
{
	return 0;
}

/* fib_rules ->compare hook. Inspects none of its arguments and always
 * returns 1.
 */
static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	return 1;
}

/* fib_rules ->fill hook: zero the source/destination prefix lengths and the
 * TOS field of the dumped rule header. Always returns 0.
 */
static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			  struct fib_rule_hdr *frh)
{
	frh->tos = 0;
	frh->src_len = 0;
	frh->dst_len = 0;

	return 0;
}

203
/* Template that ipmr_rules_init() hands to fib_rules_register() for each
 * network namespace.
 */
static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
	/* identity and sizing */
	.family		= RTNL_FAMILY_IPMR,
	.rule_size	= sizeof(struct ipmr_rule),
	.addr_size	= sizeof(u32),
	.nlgroup	= RTNLGRP_IPV4_RULE,
	.policy		= ipmr_rule_policy,
	.owner		= THIS_MODULE,

	/* callbacks */
	.action		= ipmr_rule_action,
	.match		= ipmr_rule_match,
	.configure	= ipmr_rule_configure,
	.compare	= ipmr_rule_compare,
	.fill		= ipmr_rule_fill,
};

/* Per-netns setup for the multiple-tables configuration.
 *
 * Registers the ipmr rule operations, creates the RT_TABLE_DEFAULT table
 * and installs a default rule (preference 0x7fff) pointing at it. On
 * success the registered ops are recorded in net->ipv4.mr_rules_ops.
 *
 * Returns 0 on success or a negative errno; on failure everything set up
 * so far is undone in reverse order.
 */
static int __net_init ipmr_rules_init(struct net *net)
{
	struct fib_rules_ops *rules_ops;
	struct mr_table *dflt;
	int rc;

	rules_ops = fib_rules_register(&ipmr_rules_ops_template, net);
	if (IS_ERR(rules_ops))
		return PTR_ERR(rules_ops);

	INIT_LIST_HEAD(&net->ipv4.mr_tables);

	dflt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	if (IS_ERR(dflt)) {
		rc = PTR_ERR(dflt);
		goto unregister_ops;
	}

	rc = fib_default_rule_add(rules_ops, 0x7fff, RT_TABLE_DEFAULT, 0);
	if (rc < 0)
		goto free_table;

	net->ipv4.mr_rules_ops = rules_ops;
	return 0;

free_table:
	ipmr_free_table(dflt);
unregister_ops:
	fib_rules_unregister(rules_ops);
	return rc;
}

/* Per-netns teardown for the multiple-tables configuration: with the RTNL
 * held, unlink and free every table, then unregister the rule operations.
 */
static void __net_exit ipmr_rules_exit(struct net *net)
{
	struct list_head *tables = &net->ipv4.mr_tables;
	struct mr_table *tbl;

	rtnl_lock();

	/* Pop tables off the front until the list is drained. */
	while (!list_empty(tables)) {
		tbl = list_first_entry(tables, struct mr_table, list);
		list_del(&tbl->list);
		ipmr_free_table(tbl);
	}

	fib_rules_unregister(net->ipv4.mr_rules_ops);
	rtnl_unlock();
}
#else
/* Single-table variant: the body runs at most once, for net->ipv4.mrt, and
 * not at all when that pointer is NULL.
 */
#define ipmr_for_each_table(mrt, net) \
	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

/* Single-table variant: @id is ignored and the namespace's only table
 * (possibly NULL) is returned.
 */
static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	return net->ipv4.mrt;
}

D
David S. Miller 已提交
270
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
271 272 273 274 275 276 277 278
			   struct mr_table **mrt)
{
	*mrt = net->ipv4.mrt;
	return 0;
}

/* Per-netns setup for the single-table configuration: create the
 * RT_TABLE_DEFAULT table and record it in net->ipv4.mrt.
 *
 * Returns 0 on success or the error encoded by ipmr_new_table().
 */
static int __net_init ipmr_rules_init(struct net *net)
{
	struct mr_table *dflt = ipmr_new_table(net, RT_TABLE_DEFAULT);

	if (!IS_ERR(dflt)) {
		net->ipv4.mrt = dflt;
		return 0;
	}

	return PTR_ERR(dflt);
}

/* Per-netns teardown for the single-table configuration: free the table
 * under the RTNL and clear the namespace's pointer to it.
 */
static void __net_exit ipmr_rules_exit(struct net *net)
{
	rtnl_lock();

	ipmr_free_table(net->ipv4.mrt);
	net->ipv4.mrt = NULL;

	rtnl_unlock();
}
#endif

/* Return the multicast routing table @id of @net, creating it if needed.
 *
 * An existing table is returned as is. Otherwise a zeroed table is
 * allocated, its cache lines, unresolved queue and expiry timer are
 * initialised, and (with multiple tables configured) it is appended to
 * the namespace's table list.
 *
 * Returns the table, ERR_PTR(-EINVAL) for an id that is too large, or
 * ERR_PTR(-ENOMEM) when the allocation fails.
 */
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
	struct mr_table *tbl;
	unsigned int line;

	/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
	if (id >= 1000000000 && id != RT_TABLE_DEFAULT)
		return ERR_PTR(-EINVAL);

	tbl = ipmr_get_table(net, id);
	if (tbl)
		return tbl;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return ERR_PTR(-ENOMEM);

	write_pnet(&tbl->net, net);
	tbl->id = id;
	/* No register VIF has been configured yet. */
	tbl->mroute_reg_vif_num = -1;

	/* Queue of unresolved entries and the forwarding cache */
	INIT_LIST_HEAD(&tbl->mfc_unres_queue);
	for (line = 0; line < MFC_LINES; line++)
		INIT_LIST_HEAD(&tbl->mfc_cache_array[line]);

	setup_timer(&tbl->ipmr_expire_timer, ipmr_expire_process,
		    (unsigned long)tbl);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
	/* Link the table in only once it is fully initialised. */
	list_add_tail_rcu(&tbl->list, &net->ipv4.mr_tables);
#endif
	return tbl;
}
L
Linus Torvalds 已提交
331

332 333 334
static void ipmr_free_table(struct mr_table *mrt)
{
	del_timer_sync(&mrt->ipmr_expire_timer);
335
	mroute_clean_tables(mrt, true);
336 337 338
	kfree(mrt);
}

L
Linus Torvalds 已提交
339 340
/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

341 342
static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
343 344
	struct net *net = dev_net(dev);

345 346
	dev_close(dev);

347
	dev = __dev_get_by_name(net, "tunl0");
348
	if (dev) {
349
		const struct net_device_ops *ops = dev->netdev_ops;
350 351 352 353 354 355 356 357 358 359 360 361
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

362 363 364 365 366 367 368
		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
369 370 371
	}
}

372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388
/* Prepare the in_device of a pimreg/tunnel device for use as a VIF.
 *
 * Must be called with the RTNL held. Marks all IPv4 devconf and neighbour
 * parameter values as explicitly set and turns reverse-path filtering off.
 *
 * Returns false when @dev has no in_device, true otherwise.
 */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
	struct in_device *idev;

	ASSERT_RTNL();

	idev = __in_dev_get_rtnl(dev);
	if (idev) {
		ipv4_devconf_setall(idev);
		neigh_parms_data_state_setall(idev->arp_parms);
		IPV4_DEVCONF(idev->cnf, RP_FILTER) = 0;
		return true;
	}

	return false;
}

389
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
L
Linus Torvalds 已提交
390 391 392
{
	struct net_device  *dev;

393
	dev = __dev_get_by_name(net, "tunl0");
L
Linus Torvalds 已提交
394 395

	if (dev) {
396
		const struct net_device_ops *ops = dev->netdev_ops;
L
Linus Torvalds 已提交
397 398 399 400 401 402 403 404 405 406 407
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
S
Stephen Hemminger 已提交
408
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
L
Linus Torvalds 已提交
409

410 411 412 413 414 415
		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
			set_fs(oldfs);
E
Eric Dumazet 已提交
416
		} else {
417
			err = -EOPNOTSUPP;
E
Eric Dumazet 已提交
418
		}
L
Linus Torvalds 已提交
419 420
		dev = NULL;

421 422
		if (err == 0 &&
		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
L
Linus Torvalds 已提交
423
			dev->flags |= IFF_MULTICAST;
424
			if (!ipmr_init_vif_indev(dev))
L
Linus Torvalds 已提交
425 426 427
				goto failure;
			if (dev_open(dev))
				goto failure;
428
			dev_hold(dev);
L
Linus Torvalds 已提交
429 430 431 432 433 434 435 436 437
		}
	}
	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}

438
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
439
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
440
{
441
	struct net *net = dev_net(dev);
442
	struct mr_table *mrt;
D
David S. Miller 已提交
443 444
	struct flowi4 fl4 = {
		.flowi4_oif	= dev->ifindex,
445
		.flowi4_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
D
David S. Miller 已提交
446
		.flowi4_mark	= skb->mark,
447 448 449
	};
	int err;

D
David S. Miller 已提交
450
	err = ipmr_fib_lookup(net, &fl4, &mrt);
451 452
	if (err < 0) {
		kfree_skb(skb);
453
		return err;
454
	}
455

L
Linus Torvalds 已提交
456
	read_lock(&mrt_lock);
457 458
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
459
	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
L
Linus Torvalds 已提交
460 461
	read_unlock(&mrt_lock);
	kfree_skb(skb);
462
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
463 464
}

465 466 467 468 469
/* ndo_get_iflink for the PIM register device: always reports link index 0. */
static int reg_vif_get_iflink(const struct net_device *dev)
{
	return 0;
}

470 471
static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
472
	.ndo_get_iflink = reg_vif_get_iflink,
473 474
};

L
Linus Torvalds 已提交
475 476 477
static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
478
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
L
Linus Torvalds 已提交
479
	dev->flags		= IFF_NOARP;
480
	dev->netdev_ops		= &reg_vif_netdev_ops;
L
Linus Torvalds 已提交
481
	dev->destructor		= free_netdev;
T
Tom Goff 已提交
482
	dev->features		|= NETIF_F_NETNS_LOCAL;
L
Linus Torvalds 已提交
483 484
}

485
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
L
Linus Torvalds 已提交
486 487
{
	struct net_device *dev;
488
	char name[IFNAMSIZ];
L
Linus Torvalds 已提交
489

490 491 492 493
	if (mrt->id == RT_TABLE_DEFAULT)
		sprintf(name, "pimreg");
	else
		sprintf(name, "pimreg%u", mrt->id);
L
Linus Torvalds 已提交
494

495
	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
L
Linus Torvalds 已提交
496

497
	if (!dev)
L
Linus Torvalds 已提交
498 499
		return NULL;

T
Tom Goff 已提交
500 501
	dev_net_set(dev, net);

L
Linus Torvalds 已提交
502 503 504 505 506
	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}

507
	if (!ipmr_init_vif_indev(dev))
L
Linus Torvalds 已提交
508 509 510 511
		goto failure;
	if (dev_open(dev))
		goto failure;

512 513
	dev_hold(dev);

L
Linus Torvalds 已提交
514 515 516 517 518 519
	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}
520 521 522 523 524 525 526 527 528

/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
		     unsigned int pimlen)
{
	struct net_device *reg_dev = NULL;
	struct iphdr *encap;

	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
529
	/* Check that:
530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563
	 * a. packet is really sent to a multicast group
	 * b. packet is not a NULL-REGISTER
	 * c. packet is not truncated
	 */
	if (!ipv4_is_multicast(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + pimlen > skb->len)
		return 1;

	read_lock(&mrt_lock);
	if (mrt->mroute_reg_vif_num >= 0)
		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
	read_unlock(&mrt_lock);

	if (!reg_dev)
		return 1;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = CHECKSUM_NONE;

	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

	netif_rx(skb);

	return NET_RX_SUCCESS;
}
#else
/* Stub used when neither CONFIG_IP_PIMSM_V1 nor CONFIG_IP_PIMSM_V2 is
 * defined: no register device can be created, so NULL is always returned.
 */
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	return NULL;
}
L
Linus Torvalds 已提交
564 565
#endif

566 567
/**
 *	vif_delete - Delete a VIF entry
568
 *	@notify: Set to 1, if the caller is a notifier_call
L
Linus Torvalds 已提交
569
 */
570
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
571
		      struct list_head *head)
L
Linus Torvalds 已提交
572 573 574 575 576
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

577
	if (vifi < 0 || vifi >= mrt->maxvif)
L
Linus Torvalds 已提交
578 579
		return -EADDRNOTAVAIL;

580
	v = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
581 582 583 584 585 586 587 588 589 590

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

591 592
	if (vifi == mrt->mroute_reg_vif_num)
		mrt->mroute_reg_vif_num = -1;
L
Linus Torvalds 已提交
593

E
Eric Dumazet 已提交
594
	if (vifi + 1 == mrt->maxvif) {
L
Linus Torvalds 已提交
595
		int tmp;
E
Eric Dumazet 已提交
596 597

		for (tmp = vifi - 1; tmp >= 0; tmp--) {
598
			if (VIF_EXISTS(mrt, tmp))
L
Linus Torvalds 已提交
599 600
				break;
		}
601
		mrt->maxvif = tmp+1;
L
Linus Torvalds 已提交
602 603 604 605 606 607
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

E
Eric Dumazet 已提交
608 609
	in_dev = __in_dev_get_rtnl(dev);
	if (in_dev) {
610
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
611 612 613
		inet_netconf_notify_devconf(dev_net(dev),
					    NETCONFA_MC_FORWARDING,
					    dev->ifindex, &in_dev->cnf);
L
Linus Torvalds 已提交
614 615 616
		ip_rt_multicast_event(in_dev);
	}

E
Eric Dumazet 已提交
617
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
618
		unregister_netdevice_queue(dev, head);
L
Linus Torvalds 已提交
619 620 621 622 623

	dev_put(dev);
	return 0;
}

624
static void ipmr_cache_free_rcu(struct rcu_head *head)
625
{
626 627
	struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);

628 629 630
	kmem_cache_free(mrt_cachep, c);
}

631 632 633 634 635
/* Schedule @c to be freed through call_rcu(); the memory is released later
 * by ipmr_cache_free_rcu().
 */
static inline void ipmr_cache_free(struct mfc_cache *c)
{
	call_rcu(&c->rcu, ipmr_cache_free_rcu);
}

L
Linus Torvalds 已提交
636
/* Destroy an unresolved cache entry, killing queued skbs
E
Eric Dumazet 已提交
637
 * and reporting error to netlink readers.
L
Linus Torvalds 已提交
638
 */
639
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
L
Linus Torvalds 已提交
640
{
641
	struct net *net = read_pnet(&mrt->net);
L
Linus Torvalds 已提交
642
	struct sk_buff *skb;
643
	struct nlmsgerr *e;
L
Linus Torvalds 已提交
644

645
	atomic_dec(&mrt->cache_resolve_queue_len);
L
Linus Torvalds 已提交
646

J
Jianjun Kong 已提交
647
	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
648
		if (ip_hdr(skb)->version == 0) {
L
Linus Torvalds 已提交
649 650
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
651
			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
L
Linus Torvalds 已提交
652
			skb_trim(skb, nlh->nlmsg_len);
653
			e = nlmsg_data(nlh);
654 655
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));
656

657
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
E
Eric Dumazet 已提交
658
		} else {
L
Linus Torvalds 已提交
659
			kfree_skb(skb);
E
Eric Dumazet 已提交
660
		}
L
Linus Torvalds 已提交
661 662
	}

663
	ipmr_cache_free(c);
L
Linus Torvalds 已提交
664 665
}

666 667
/* Timer process for the unresolved queue. */
static void ipmr_expire_process(unsigned long arg)
L
Linus Torvalds 已提交
668
{
669
	struct mr_table *mrt = (struct mr_table *)arg;
L
Linus Torvalds 已提交
670 671
	unsigned long now;
	unsigned long expires;
672
	struct mfc_cache *c, *next;
L
Linus Torvalds 已提交
673 674

	if (!spin_trylock(&mfc_unres_lock)) {
675
		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
L
Linus Torvalds 已提交
676 677 678
		return;
	}

679
	if (list_empty(&mrt->mfc_unres_queue))
L
Linus Torvalds 已提交
680 681 682 683 684
		goto out;

	now = jiffies;
	expires = 10*HZ;

685
	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
L
Linus Torvalds 已提交
686 687 688 689 690 691 692
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			continue;
		}

693
		list_del(&c->list);
694
		mroute_netlink_event(mrt, c, RTM_DELROUTE);
695
		ipmr_destroy_unres(mrt, c);
L
Linus Torvalds 已提交
696 697
	}

698 699
	if (!list_empty(&mrt->mfc_unres_queue))
		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
L
Linus Torvalds 已提交
700 701 702 703 704 705

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */
706
static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
707
				   unsigned char *ttls)
L
Linus Torvalds 已提交
708 709 710 711 712 713 714
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

715 716
	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
		if (VIF_EXISTS(mrt, vifi) &&
717
		    ttls[vifi] && ttls[vifi] < 255) {
L
Linus Torvalds 已提交
718 719 720 721 722 723 724 725 726
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}

727 728
static int vif_add(struct net *net, struct mr_table *mrt,
		   struct vifctl *vifc, int mrtsock)
L
Linus Torvalds 已提交
729 730
{
	int vifi = vifc->vifc_vifi;
731
	struct vif_device *v = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
732 733
	struct net_device *dev;
	struct in_device *in_dev;
734
	int err;
L
Linus Torvalds 已提交
735 736

	/* Is vif busy ? */
737
	if (VIF_EXISTS(mrt, vifi))
L
Linus Torvalds 已提交
738 739 740 741
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
	case VIFF_REGISTER:
742
		if (!ipmr_pimsm_enabled())
743 744
			return -EINVAL;
		/* Special Purpose VIF in PIM
L
Linus Torvalds 已提交
745 746
		 * All the packets will be sent to the daemon
		 */
747
		if (mrt->mroute_reg_vif_num >= 0)
L
Linus Torvalds 已提交
748
			return -EADDRINUSE;
749
		dev = ipmr_reg_vif(net, mrt);
L
Linus Torvalds 已提交
750 751
		if (!dev)
			return -ENOBUFS;
752 753 754
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
755
			dev_put(dev);
756 757
			return err;
		}
L
Linus Torvalds 已提交
758
		break;
759
	case VIFF_TUNNEL:
760
		dev = ipmr_new_tunnel(net, vifc);
L
Linus Torvalds 已提交
761 762
		if (!dev)
			return -ENOBUFS;
763 764 765
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
766
			dev_put(dev);
767 768
			return err;
		}
L
Linus Torvalds 已提交
769
		break;
770
	case VIFF_USE_IFINDEX:
L
Linus Torvalds 已提交
771
	case 0:
772 773
		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
774
			if (dev && !__in_dev_get_rtnl(dev)) {
775 776 777
				dev_put(dev);
				return -EADDRNOTAVAIL;
			}
E
Eric Dumazet 已提交
778
		} else {
779
			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
E
Eric Dumazet 已提交
780
		}
L
Linus Torvalds 已提交
781 782
		if (!dev)
			return -EADDRNOTAVAIL;
783
		err = dev_set_allmulti(dev, 1);
784 785
		if (err) {
			dev_put(dev);
786
			return err;
787
		}
L
Linus Torvalds 已提交
788 789 790 791 792
		break;
	default:
		return -EINVAL;
	}

E
Eric Dumazet 已提交
793 794
	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev) {
795
		dev_put(dev);
L
Linus Torvalds 已提交
796
		return -EADDRNOTAVAIL;
797
	}
798
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
799 800
	inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING, dev->ifindex,
				    &in_dev->cnf);
L
Linus Torvalds 已提交
801 802
	ip_rt_multicast_event(in_dev);

E
Eric Dumazet 已提交
803 804
	/* Fill in the VIF structures */

J
Jianjun Kong 已提交
805 806 807 808
	v->rate_limit = vifc->vifc_rate_limit;
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
	v->flags = vifc->vifc_flags;
L
Linus Torvalds 已提交
809 810
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
J
Jianjun Kong 已提交
811
	v->threshold = vifc->vifc_threshold;
L
Linus Torvalds 已提交
812 813 814 815 816
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
E
Eric Dumazet 已提交
817
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
818
		v->link = dev_get_iflink(dev);
L
Linus Torvalds 已提交
819 820 821

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
J
Jianjun Kong 已提交
822
	v->dev = dev;
E
Eric Dumazet 已提交
823
	if (v->flags & VIFF_REGISTER)
824 825 826
		mrt->mroute_reg_vif_num = vifi;
	if (vifi+1 > mrt->maxvif)
		mrt->maxvif = vifi+1;
L
Linus Torvalds 已提交
827 828 829 830
	write_unlock_bh(&mrt_lock);
	return 0;
}

831
/* called with rcu_read_lock() */
832
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
833 834
					 __be32 origin,
					 __be32 mcastgrp)
L
Linus Torvalds 已提交
835
{
J
Jianjun Kong 已提交
836
	int line = MFC_HASH(mcastgrp, origin);
L
Linus Torvalds 已提交
837 838
	struct mfc_cache *c;

839
	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) {
840 841
		if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
			return c;
L
Linus Torvalds 已提交
842
	}
843
	return NULL;
L
Linus Torvalds 已提交
844 845
}

846 847 848 849
/* Look for a (*,*,oif) entry */
static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
						    int vifi)
{
850
	int line = MFC_HASH(htonl(INADDR_ANY), htonl(INADDR_ANY));
851 852 853
	struct mfc_cache *c;

	list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list)
854 855
		if (c->mfc_origin == htonl(INADDR_ANY) &&
		    c->mfc_mcastgrp == htonl(INADDR_ANY) &&
856 857 858 859 860 861 862 863 864 865
		    c->mfc_un.res.ttls[vifi] < 255)
			return c;

	return NULL;
}

/* Look for a (*,G) entry for group @mcastgrp that covers @vifi.
 *
 * A (*,G) entry is accepted when its own TTL threshold for @vifi is below
 * 255, or when the (*,*) entry found for its parent VIF has a threshold
 * below 255 for @vifi. When @mcastgrp is INADDR_ANY, or no (*,G) entry
 * qualifies, the result of the (*,*,@vifi) lookup is returned instead.
 */
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
					     __be32 mcastgrp, int vifi)
{
	const __be32 any = htonl(INADDR_ANY);
	int line = MFC_HASH(mcastgrp, any);
	struct mfc_cache *entry, *proxy;

	if (mcastgrp != any) {
		list_for_each_entry_rcu(entry, &mrt->mfc_cache_array[line], list) {
			if (entry->mfc_origin != any ||
			    entry->mfc_mcastgrp != mcastgrp)
				continue;

			if (entry->mfc_un.res.ttls[vifi] < 255)
				return entry;

			/* It's ok if the vifi is part of the static tree */
			proxy = ipmr_cache_find_any_parent(mrt,
							   entry->mfc_parent);
			if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
				return entry;
		}
	}

	return ipmr_cache_find_any_parent(mrt, vifi);
}

889
/* Allocate a multicast cache entry */
890
static struct mfc_cache *ipmr_cache_alloc(void)
L
Linus Torvalds 已提交
891
{
J
Jianjun Kong 已提交
892
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
893

894 895
	if (c) {
		c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
896
		c->mfc_un.res.minvif = MAXVIFS;
897
	}
L
Linus Torvalds 已提交
898 899 900
	return c;
}

901
static struct mfc_cache *ipmr_cache_alloc_unres(void)
L
Linus Torvalds 已提交
902
{
J
Jianjun Kong 已提交
903
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
904 905 906 907 908

	if (c) {
		skb_queue_head_init(&c->mfc_un.unres.unresolved);
		c->mfc_un.unres.expires = jiffies + 10*HZ;
	}
L
Linus Torvalds 已提交
909 910 911
	return c;
}

912
/* A cache entry has gone into a resolved state from queued */
913 914
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
			       struct mfc_cache *uc, struct mfc_cache *c)
L
Linus Torvalds 已提交
915 916
{
	struct sk_buff *skb;
917
	struct nlmsgerr *e;
L
Linus Torvalds 已提交
918

E
Eric Dumazet 已提交
919
	/* Play the pending entries through our router */
J
Jianjun Kong 已提交
920
	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
921
		if (ip_hdr(skb)->version == 0) {
L
Linus Torvalds 已提交
922 923
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

924
			if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
E
Eric Dumazet 已提交
925 926
				nlh->nlmsg_len = skb_tail_pointer(skb) -
						 (u8 *)nlh;
L
Linus Torvalds 已提交
927 928
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
929
				nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
L
Linus Torvalds 已提交
930
				skb_trim(skb, nlh->nlmsg_len);
931
				e = nlmsg_data(nlh);
932 933
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
L
Linus Torvalds 已提交
934
			}
935

936
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
E
Eric Dumazet 已提交
937
		} else {
938
			ip_mr_forward(net, mrt, skb, c, 0);
E
Eric Dumazet 已提交
939
		}
L
Linus Torvalds 已提交
940 941 942
	}
}

943 944
/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 * expects the following bizarre scheme.
L
Linus Torvalds 已提交
945
 *
946
 * Called under mrt_lock.
L
Linus Torvalds 已提交
947
 */
948
static int ipmr_cache_report(struct mr_table *mrt,
949
			     struct sk_buff *pkt, vifi_t vifi, int assert)
L
Linus Torvalds 已提交
950
{
951
	const int ihl = ip_hdrlen(pkt);
952
	struct sock *mroute_sk;
L
Linus Torvalds 已提交
953 954
	struct igmphdr *igmp;
	struct igmpmsg *msg;
955
	struct sk_buff *skb;
L
Linus Torvalds 已提交
956 957 958 959 960 961 962
	int ret;

	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
		skb = alloc_skb(128, GFP_ATOMIC);

S
Stephen Hemminger 已提交
963
	if (!skb)
L
Linus Torvalds 已提交
964 965 966 967
		return -ENOBUFS;

	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
E
Eric Dumazet 已提交
968 969 970
		 * Duplicate old header, fix ihl, length etc.
		 * And all this only to mangle msg->im_msgtype and
		 * to set msg->im_mbz to "mbz" :-)
L
Linus Torvalds 已提交
971
		 */
972 973
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
974
		skb_reset_transport_header(skb);
975
		msg = (struct igmpmsg *)skb_network_header(skb);
976
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
L
Linus Torvalds 已提交
977 978
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
979
		msg->im_vif = mrt->mroute_reg_vif_num;
980 981 982
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999
	} else {
		/* Copy the IP header */
		skb_set_network_header(skb, skb->len);
		skb_put(skb, ihl);
		skb_copy_to_linear_data(skb, pkt->data, ihl);
		/* Flag to the kernel this is a route add */
		ip_hdr(skb)->protocol = 0;
		msg = (struct igmpmsg *)skb_network_header(skb);
		msg->im_vif = vifi;
		skb_dst_set(skb, dst_clone(skb_dst(pkt)));
		/* Add our header */
		igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
		igmp->type = assert;
		msg->im_msgtype = assert;
		igmp->code = 0;
		ip_hdr(skb)->tot_len = htons(skb->len);	/* Fix the length */
		skb->transport_header = skb->network_header;
1000
	}
L
Linus Torvalds 已提交
1001

E
Eric Dumazet 已提交
1002 1003
	rcu_read_lock();
	mroute_sk = rcu_dereference(mrt->mroute_sk);
1004
	if (!mroute_sk) {
E
Eric Dumazet 已提交
1005
		rcu_read_unlock();
L
Linus Torvalds 已提交
1006 1007 1008 1009
		kfree_skb(skb);
		return -EINVAL;
	}

E
Eric Dumazet 已提交
1010
	/* Deliver to mrouted */
E
Eric Dumazet 已提交
1011 1012
	ret = sock_queue_rcv_skb(mroute_sk, skb);
	rcu_read_unlock();
1013
	if (ret < 0) {
1014
		net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
L
Linus Torvalds 已提交
1015 1016 1017 1018 1019 1020
		kfree_skb(skb);
	}

	return ret;
}

1021 1022 1023
/* Queue a packet for resolution. It gets locked cache entry!
 *
 * Looks up the pending (unresolved) entry matching the packet's source and
 * group.  If there is none, a new entry is created - provided fewer than 10
 * are already outstanding for this table - and an IGMPMSG_NOCACHE report is
 * sent via ipmr_cache_report().  The packet is then appended to the entry's
 * queue unless that queue already holds more than 3 packets.
 *
 * The skb is consumed on every path: it is either queued or freed.
 *
 * Returns 0 if the packet was queued, -ENOBUFS if no entry could be created
 * or the entry's queue is full, or the negative error returned by
 * ipmr_cache_report().
 */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
				 struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct mfc_cache *pos;
	struct mfc_cache *c = NULL;
	int err;

	spin_lock_bh(&mfc_unres_lock);

	/* Is a resolution for this (S,G) already pending? */
	list_for_each_entry(pos, &mrt->mfc_unres_queue, list) {
		if (pos->mfc_mcastgrp == iph->daddr &&
		    pos->mfc_origin == iph->saddr) {
			c = pos;
			break;
		}
	}

	if (!c) {
		/* Create a new entry if allowable: only allocate while the
		 * number of pending entries is below the limit.
		 */
		if (atomic_read(&mrt->cache_resolve_queue_len) < 10)
			c = ipmr_cache_alloc_unres();
		if (!c) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/* Fill in the new cache entry */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/* Reflect first query at mrouted. */
		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
		if (err < 0) {
			/* If the report failed throw the cache entry
			 * out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&mrt->cache_resolve_queue_len);
		list_add(&c->list, &mrt->mfc_unres_queue);
		mroute_netlink_event(mrt, c, RTM_NEWROUTE);

		/* First pending entry: arm the expiry timer. */
		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/* See if we can append the packet */
	if (c->mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

1088
/* MFC cache manipulation by user space mroute daemon */
L
Linus Torvalds 已提交
1089

1090
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
L
Linus Torvalds 已提交
1091 1092
{
	int line;
1093
	struct mfc_cache *c, *next;
L
Linus Torvalds 已提交
1094

J
Jianjun Kong 已提交
1095
	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
L
Linus Torvalds 已提交
1096

1097
	list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
L
Linus Torvalds 已提交
1098
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1099 1100
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
		    (parent == -1 || parent == c->mfc_parent)) {
1101
			list_del_rcu(&c->list);
1102
			mroute_netlink_event(mrt, c, RTM_DELROUTE);
1103
			ipmr_cache_free(c);
L
Linus Torvalds 已提交
1104 1105 1106 1107 1108 1109
			return 0;
		}
	}
	return -ENOENT;
}

1110
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1111
			struct mfcctl *mfc, int mrtsock, int parent)
L
Linus Torvalds 已提交
1112
{
1113
	bool found = false;
L
Linus Torvalds 已提交
1114
	int line;
1115
	struct mfc_cache *uc, *c;
L
Linus Torvalds 已提交
1116

1117 1118 1119
	if (mfc->mfcc_parent >= MAXVIFS)
		return -ENFILE;

J
Jianjun Kong 已提交
1120
	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
L
Linus Torvalds 已提交
1121

1122
	list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
L
Linus Torvalds 已提交
1123
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
1124 1125
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr &&
		    (parent == -1 || parent == c->mfc_parent)) {
1126
			found = true;
L
Linus Torvalds 已提交
1127
			break;
1128
		}
L
Linus Torvalds 已提交
1129 1130
	}

1131
	if (found) {
L
Linus Torvalds 已提交
1132 1133
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
1134
		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
L
Linus Torvalds 已提交
1135 1136 1137
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
1138
		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1139 1140 1141
		return 0;
	}

1142
	if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
1143
	    !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
L
Linus Torvalds 已提交
1144 1145
		return -EINVAL;

1146
	c = ipmr_cache_alloc();
1147
	if (!c)
L
Linus Torvalds 已提交
1148 1149
		return -ENOMEM;

J
Jianjun Kong 已提交
1150 1151 1152
	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent = mfc->mfcc_parent;
1153
	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
L
Linus Torvalds 已提交
1154 1155 1156
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

1157
	list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
L
Linus Torvalds 已提交
1158

1159 1160
	/* Check to see if we resolved a queued list. If so we
	 * need to send on the frames and tidy up.
L
Linus Torvalds 已提交
1161
	 */
1162
	found = false;
L
Linus Torvalds 已提交
1163
	spin_lock_bh(&mfc_unres_lock);
1164
	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1165
		if (uc->mfc_origin == c->mfc_origin &&
L
Linus Torvalds 已提交
1166
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1167
			list_del(&uc->list);
1168
			atomic_dec(&mrt->cache_resolve_queue_len);
1169
			found = true;
L
Linus Torvalds 已提交
1170 1171 1172
			break;
		}
	}
1173 1174
	if (list_empty(&mrt->mfc_unres_queue))
		del_timer(&mrt->ipmr_expire_timer);
L
Linus Torvalds 已提交
1175 1176
	spin_unlock_bh(&mfc_unres_lock);

1177
	if (found) {
1178
		ipmr_cache_resolve(net, mrt, uc, c);
1179
		ipmr_cache_free(uc);
L
Linus Torvalds 已提交
1180
	}
1181
	mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1182 1183 1184
	return 0;
}

1185
/* Close the multicast socket, and clear the vif tables etc */
1186
static void mroute_clean_tables(struct mr_table *mrt, bool all)
L
Linus Torvalds 已提交
1187 1188
{
	int i;
1189
	LIST_HEAD(list);
1190
	struct mfc_cache *c, *next;
1191

E
Eric Dumazet 已提交
1192
	/* Shut down all active vif entries */
1193
	for (i = 0; i < mrt->maxvif; i++) {
1194 1195 1196
		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
			continue;
		vif_delete(mrt, i, 0, &list);
L
Linus Torvalds 已提交
1197
	}
1198
	unregister_netdevice_many(&list);
L
Linus Torvalds 已提交
1199

E
Eric Dumazet 已提交
1200
	/* Wipe the cache */
1201
	for (i = 0; i < MFC_LINES; i++) {
1202
		list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
1203
			if (!all && (c->mfc_flags & MFC_STATIC))
L
Linus Torvalds 已提交
1204
				continue;
1205
			list_del_rcu(&c->list);
1206
			mroute_netlink_event(mrt, c, RTM_DELROUTE);
1207
			ipmr_cache_free(c);
L
Linus Torvalds 已提交
1208 1209 1210
		}
	}

1211
	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
L
Linus Torvalds 已提交
1212
		spin_lock_bh(&mfc_unres_lock);
1213
		list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
1214
			list_del(&c->list);
1215
			mroute_netlink_event(mrt, c, RTM_DELROUTE);
1216
			ipmr_destroy_unres(mrt, c);
L
Linus Torvalds 已提交
1217 1218 1219 1220 1221
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

E
Eric Dumazet 已提交
1222 1223 1224
/* called from ip_ra_control(), before an RCU grace period,
 * we dont need to call synchronize_rcu() here
 */
L
Linus Torvalds 已提交
1225 1226
static void mrtsock_destruct(struct sock *sk)
{
1227
	struct net *net = sock_net(sk);
1228
	struct mr_table *mrt;
1229

L
Linus Torvalds 已提交
1230
	rtnl_lock();
1231
	ipmr_for_each_table(mrt, net) {
E
Eric Dumazet 已提交
1232
		if (sk == rtnl_dereference(mrt->mroute_sk)) {
1233
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1234 1235 1236
			inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
						    NETCONFA_IFINDEX_ALL,
						    net->ipv4.devconf_all);
1237
			RCU_INIT_POINTER(mrt->mroute_sk, NULL);
1238
			mroute_clean_tables(mrt, false);
1239
		}
L
Linus Torvalds 已提交
1240 1241 1242 1243
	}
	rtnl_unlock();
}

1244 1245 1246 1247
/* Socket options and virtual interface manipulation. The whole
 * virtual interface system is a complete heap, but unfortunately
 * that's how BSD mrouted happens to think. Maybe one day with a proper
 * MOSPF/PIM router set up we can clean this up.
L
Linus Torvalds 已提交
1248
 */
1249

1250 1251
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
			 unsigned int optlen)
L
Linus Torvalds 已提交
1252
{
1253
	struct net *net = sock_net(sk);
1254
	int val, ret = 0, parent = 0;
1255
	struct mr_table *mrt;
1256 1257 1258
	struct vifctl vif;
	struct mfcctl mfc;
	u32 uval;
1259

1260 1261
	/* There's one exception to the lock - MRT_DONE which needs to unlock */
	rtnl_lock();
1262
	if (sk->sk_type != SOCK_RAW ||
1263 1264 1265 1266
	    inet_sk(sk)->inet_num != IPPROTO_IGMP) {
		ret = -EOPNOTSUPP;
		goto out_unlock;
	}
1267

1268
	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1269 1270 1271 1272
	if (!mrt) {
		ret = -ENOENT;
		goto out_unlock;
	}
S
Stephen Hemminger 已提交
1273
	if (optname != MRT_INIT) {
1274
		if (sk != rcu_access_pointer(mrt->mroute_sk) &&
1275 1276 1277 1278
		    !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
			ret = -EACCES;
			goto out_unlock;
		}
L
Linus Torvalds 已提交
1279 1280
	}

S
Stephen Hemminger 已提交
1281 1282
	switch (optname) {
	case MRT_INIT:
1283
		if (optlen != sizeof(int)) {
1284
			ret = -EINVAL;
1285 1286 1287
			break;
		}
		if (rtnl_dereference(mrt->mroute_sk)) {
1288 1289
			ret = -EADDRINUSE;
			break;
1290
		}
S
Stephen Hemminger 已提交
1291 1292 1293

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
1294
			rcu_assign_pointer(mrt->mroute_sk, sk);
1295
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1296 1297 1298
			inet_netconf_notify_devconf(net, NETCONFA_MC_FORWARDING,
						    NETCONFA_IFINDEX_ALL,
						    net->ipv4.devconf_all);
S
Stephen Hemminger 已提交
1299
		}
1300
		break;
S
Stephen Hemminger 已提交
1301
	case MRT_DONE:
1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313
		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
			ret = -EACCES;
		} else {
			/* We need to unlock here because mrtsock_destruct takes
			 * care of rtnl itself and we can't change that due to
			 * the IP_ROUTER_ALERT setsockopt which runs without it.
			 */
			rtnl_unlock();
			ret = ip_ra_control(sk, 0, NULL);
			goto out;
		}
		break;
S
Stephen Hemminger 已提交
1314 1315
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327
		if (optlen != sizeof(vif)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&vif, optval, sizeof(vif))) {
			ret = -EFAULT;
			break;
		}
		if (vif.vifc_vifi >= MAXVIFS) {
			ret = -ENFILE;
			break;
		}
J
Jianjun Kong 已提交
1328
		if (optname == MRT_ADD_VIF) {
E
Eric Dumazet 已提交
1329 1330
			ret = vif_add(net, mrt, &vif,
				      sk == rtnl_dereference(mrt->mroute_sk));
S
Stephen Hemminger 已提交
1331
		} else {
1332
			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
S
Stephen Hemminger 已提交
1333
		}
1334
		break;
1335 1336 1337
	/* Manipulate the forwarding caches. These live
	 * in a sort of kernel/user symbiosis.
	 */
S
Stephen Hemminger 已提交
1338 1339
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
1340 1341 1342
		parent = -1;
	case MRT_ADD_MFC_PROXY:
	case MRT_DEL_MFC_PROXY:
1343 1344 1345 1346 1347 1348 1349 1350
		if (optlen != sizeof(mfc)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&mfc, optval, sizeof(mfc))) {
			ret = -EFAULT;
			break;
		}
1351 1352 1353 1354
		if (parent == 0)
			parent = mfc.mfcc_parent;
		if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
			ret = ipmr_mfc_delete(mrt, &mfc, parent);
S
Stephen Hemminger 已提交
1355
		else
E
Eric Dumazet 已提交
1356
			ret = ipmr_mfc_add(net, mrt, &mfc,
1357 1358
					   sk == rtnl_dereference(mrt->mroute_sk),
					   parent);
1359
		break;
1360
	/* Control PIM assert. */
S
Stephen Hemminger 已提交
1361
	case MRT_ASSERT:
1362 1363 1364 1365 1366 1367 1368 1369 1370 1371
		if (optlen != sizeof(val)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(val, (int __user *)optval)) {
			ret = -EFAULT;
			break;
		}
		mrt->mroute_do_assert = val;
		break;
S
Stephen Hemminger 已提交
1372
	case MRT_PIM:
1373
		if (!ipmr_pimsm_enabled()) {
1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384
			ret = -ENOPROTOOPT;
			break;
		}
		if (optlen != sizeof(val)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(val, (int __user *)optval)) {
			ret = -EFAULT;
			break;
		}
S
Stephen Hemminger 已提交
1385

1386 1387 1388 1389
		val = !!val;
		if (val != mrt->mroute_do_pim) {
			mrt->mroute_do_pim = val;
			mrt->mroute_do_assert = val;
L
Linus Torvalds 已提交
1390
		}
1391
		break;
1392
	case MRT_TABLE:
1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404
		if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
			ret = -ENOPROTOOPT;
			break;
		}
		if (optlen != sizeof(uval)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(uval, (u32 __user *)optval)) {
			ret = -EFAULT;
			break;
		}
1405

E
Eric Dumazet 已提交
1406 1407 1408
		if (sk == rtnl_dereference(mrt->mroute_sk)) {
			ret = -EBUSY;
		} else {
1409
			mrt = ipmr_new_table(net, uval);
1410 1411
			if (IS_ERR(mrt))
				ret = PTR_ERR(mrt);
1412
			else
1413
				raw_sk(sk)->ipmr_table = uval;
E
Eric Dumazet 已提交
1414
		}
1415
		break;
1416
	/* Spurious command, or MRT_VERSION which you cannot set. */
S
Stephen Hemminger 已提交
1417
	default:
1418
		ret = -ENOPROTOOPT;
L
Linus Torvalds 已提交
1419
	}
1420 1421 1422 1423
out_unlock:
	rtnl_unlock();
out:
	return ret;
L
Linus Torvalds 已提交
1424 1425
}

1426
/* Getsock opt support for the multicast routing system. */
J
Jianjun Kong 已提交
1427
int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
L
Linus Torvalds 已提交
1428 1429 1430
{
	int olr;
	int val;
1431
	struct net *net = sock_net(sk);
1432 1433
	struct mr_table *mrt;

1434 1435 1436 1437
	if (sk->sk_type != SOCK_RAW ||
	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
		return -EOPNOTSUPP;

1438
	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1439
	if (!mrt)
1440
		return -ENOENT;
L
Linus Torvalds 已提交
1441

1442 1443 1444 1445 1446
	switch (optname) {
	case MRT_VERSION:
		val = 0x0305;
		break;
	case MRT_PIM:
1447
		if (!ipmr_pimsm_enabled())
1448 1449 1450 1451 1452 1453 1454
			return -ENOPROTOOPT;
		val = mrt->mroute_do_pim;
		break;
	case MRT_ASSERT:
		val = mrt->mroute_do_assert;
		break;
	default:
L
Linus Torvalds 已提交
1455
		return -ENOPROTOOPT;
1456
	}
L
Linus Torvalds 已提交
1457 1458 1459 1460 1461 1462

	if (get_user(olr, optlen))
		return -EFAULT;
	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;
J
Jianjun Kong 已提交
1463
	if (put_user(olr, optlen))
L
Linus Torvalds 已提交
1464
		return -EFAULT;
J
Jianjun Kong 已提交
1465
	if (copy_to_user(optval, &val, olr))
L
Linus Torvalds 已提交
1466 1467 1468 1469
		return -EFAULT;
	return 0;
}

1470
/* The IP multicast ioctl support routines. */
L
Linus Torvalds 已提交
1471 1472 1473 1474 1475 1476
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;
1477
	struct net *net = sock_net(sk);
1478 1479 1480
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1481
	if (!mrt)
1482
		return -ENOENT;
1483

S
Stephen Hemminger 已提交
1484 1485
	switch (cmd) {
	case SIOCGETVIFCNT:
J
Jianjun Kong 已提交
1486
		if (copy_from_user(&vr, arg, sizeof(vr)))
S
Stephen Hemminger 已提交
1487
			return -EFAULT;
1488
		if (vr.vifi >= mrt->maxvif)
S
Stephen Hemminger 已提交
1489 1490
			return -EINVAL;
		read_lock(&mrt_lock);
1491 1492
		vif = &mrt->vif_table[vr.vifi];
		if (VIF_EXISTS(mrt, vr.vifi)) {
J
Jianjun Kong 已提交
1493 1494 1495 1496
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
L
Linus Torvalds 已提交
1497 1498
			read_unlock(&mrt_lock);

J
Jianjun Kong 已提交
1499
			if (copy_to_user(arg, &vr, sizeof(vr)))
S
Stephen Hemminger 已提交
1500 1501 1502 1503 1504 1505
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
J
Jianjun Kong 已提交
1506
		if (copy_from_user(&sr, arg, sizeof(sr)))
S
Stephen Hemminger 已提交
1507 1508
			return -EFAULT;

1509
		rcu_read_lock();
1510
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
S
Stephen Hemminger 已提交
1511 1512 1513 1514
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
1515
			rcu_read_unlock();
S
Stephen Hemminger 已提交
1516

J
Jianjun Kong 已提交
1517
			if (copy_to_user(arg, &sr, sizeof(sr)))
S
Stephen Hemminger 已提交
1518 1519 1520
				return -EFAULT;
			return 0;
		}
1521
		rcu_read_unlock();
S
Stephen Hemminger 已提交
1522 1523 1524
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
L
Linus Torvalds 已提交
1525 1526 1527
	}
}

1528 1529 1530 1531 1532 1533 1534 1535 1536
#ifdef CONFIG_COMPAT
/* Request/reply layout used by ipmr_compat_ioctl() for SIOCGETSGCNT, with
 * the counters declared as compat_ulong_t.  The whole structure is copied
 * from and back to user space, so its layout must not change.
 * NOTE(review): presumably mirrors struct sioc_sg_req for 32-bit callers -
 * confirm against the uapi header.
 */
struct compat_sioc_sg_req {
	struct in_addr src;	/* source address used for the cache lookup */
	struct in_addr grp;	/* group address used for the cache lookup */
	compat_ulong_t pktcnt;	/* filled from mfc_un.res.pkt */
	compat_ulong_t bytecnt;	/* filled from mfc_un.res.bytes */
	compat_ulong_t wrong_if;	/* filled from mfc_un.res.wrong_if */
};

1537 1538 1539 1540 1541 1542 1543 1544
/* Request/reply layout used by ipmr_compat_ioctl() for SIOCGETVIFCNT, with
 * the counters declared as compat_ulong_t.  The whole structure is copied
 * from and back to user space, so its layout must not change.
 * NOTE(review): presumably mirrors struct sioc_vif_req for 32-bit callers -
 * confirm against the uapi header.
 */
struct compat_sioc_vif_req {
	vifi_t	vifi;		/* Which iface */
	compat_ulong_t icount;	/* filled from the vif's pkt_in */
	compat_ulong_t ocount;	/* filled from the vif's pkt_out */
	compat_ulong_t ibytes;	/* filled from the vif's bytes_in */
	compat_ulong_t obytes;	/* filled from the vif's bytes_out */
};

1545 1546
/* Compat counterpart of the multicast routing ioctl handler: same commands,
 * but the request structures carry compat_ulong_t counters.
 *
 * Returns 0 on success, -ENOENT if the socket's table does not exist,
 * -EFAULT on a failed user copy, -EINVAL for a vif index >= maxvif,
 * -EADDRNOTAVAIL if the vif or cache entry does not exist, and
 * -ENOIOCTLCMD for any other command.
 */
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	struct net *net = sock_net(sk);
	struct compat_sioc_vif_req vr;
	struct compat_sioc_sg_req sr;
	struct vif_device *vif;
	struct mr_table *mrt;
	struct mfc_cache *c;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= mrt->maxvif)
			return -EINVAL;

		read_lock(&mrt_lock);
		if (!VIF_EXISTS(mrt, vr.vifi)) {
			read_unlock(&mrt_lock);
			return -EADDRNOTAVAIL;
		}
		/* Snapshot the counters under the lock, copy out after. */
		vif = &mrt->vif_table[vr.vifi];
		vr.icount = vif->pkt_in;
		vr.ocount = vif->pkt_out;
		vr.ibytes = vif->bytes_in;
		vr.obytes = vif->bytes_out;
		read_unlock(&mrt_lock);

		if (copy_to_user(arg, &vr, sizeof(vr)))
			return -EFAULT;
		return 0;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		rcu_read_lock();
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
		if (!c) {
			rcu_read_unlock();
			return -EADDRNOTAVAIL;
		}
		sr.pktcnt = c->mfc_un.res.pkt;
		sr.bytecnt = c->mfc_un.res.bytes;
		sr.wrong_if = c->mfc_un.res.wrong_if;
		rcu_read_unlock();

		if (copy_to_user(arg, &sr, sizeof(sr)))
			return -EFAULT;
		return 0;
	default:
		return -ENOIOCTLCMD;
	}
}
#endif

L
Linus Torvalds 已提交
1603 1604
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
1605
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1606
	struct net *net = dev_net(dev);
1607
	struct mr_table *mrt;
L
Linus Torvalds 已提交
1608 1609
	struct vif_device *v;
	int ct;
1610

L
Linus Torvalds 已提交
1611 1612
	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
1613 1614 1615 1616 1617

	ipmr_for_each_table(mrt, net) {
		v = &mrt->vif_table[0];
		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
			if (v->dev == dev)
1618
				vif_delete(mrt, ct, 1, NULL);
1619
		}
L
Linus Torvalds 已提交
1620 1621 1622 1623
	}
	return NOTIFY_DONE;
}

J
Jianjun Kong 已提交
1624
static struct notifier_block ip_mr_notifier = {
L
Linus Torvalds 已提交
1625 1626 1627
	.notifier_call = ipmr_device_event,
};

1628 1629 1630
/* Encapsulate a packet by attaching a valid IPIP header to it.
 * This avoids tunnel drivers and other mess and gives us the speed so
 * important for multicast video.
L
Linus Torvalds 已提交
1631
 */
1632 1633
static void ip_encap(struct net *net, struct sk_buff *skb,
		     __be32 saddr, __be32 daddr)
L
Linus Torvalds 已提交
1634
{
1635
	struct iphdr *iph;
1636
	const struct iphdr *old_iph = ip_hdr(skb);
1637 1638

	skb_push(skb, sizeof(struct iphdr));
1639
	skb->transport_header = skb->network_header;
1640
	skb_reset_network_header(skb);
1641
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1642

E
Eric Dumazet 已提交
1643
	iph->version	=	4;
1644 1645
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
L
Linus Torvalds 已提交
1646 1647 1648 1649 1650 1651
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
1652
	ip_select_ident(net, skb, NULL);
L
Linus Torvalds 已提交
1653 1654 1655 1656 1657 1658
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}

1659 1660
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
				      struct sk_buff *skb)
L
Linus Torvalds 已提交
1661
{
E
Eric Dumazet 已提交
1662
	struct ip_options *opt = &(IPCB(skb)->opt);
L
Linus Torvalds 已提交
1663

1664 1665
	IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
L
Linus Torvalds 已提交
1666 1667 1668 1669

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

1670
	return dst_output(net, sk, skb);
L
Linus Torvalds 已提交
1671 1672
}

1673
/* Processing handlers for ipmr_forward */
L
Linus Torvalds 已提交
1674

1675 1676
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
L
Linus Torvalds 已提交
1677
{
1678
	const struct iphdr *iph = ip_hdr(skb);
1679
	struct vif_device *vif = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
1680 1681
	struct net_device *dev;
	struct rtable *rt;
1682
	struct flowi4 fl4;
L
Linus Torvalds 已提交
1683 1684
	int    encap = 0;

1685
	if (!vif->dev)
L
Linus Torvalds 已提交
1686 1687 1688 1689
		goto out_free;

	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
J
Jianjun Kong 已提交
1690
		vif->bytes_out += skb->len;
1691 1692
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
1693
		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1694
		goto out_free;
L
Linus Torvalds 已提交
1695 1696
	}

E
Eric Dumazet 已提交
1697
	if (vif->flags & VIFF_TUNNEL) {
1698
		rt = ip_route_output_ports(net, &fl4, NULL,
1699 1700 1701 1702
					   vif->remote, vif->local,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos), vif->link);
1703
		if (IS_ERR(rt))
L
Linus Torvalds 已提交
1704 1705 1706
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
1707
		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
1708 1709 1710
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos), vif->link);
1711
		if (IS_ERR(rt))
L
Linus Torvalds 已提交
1712 1713 1714
			goto out_free;
	}

1715
	dev = rt->dst.dev;
L
Linus Torvalds 已提交
1716

1717
	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
L
Linus Torvalds 已提交
1718
		/* Do not fragment multicasts. Alas, IPv4 does not
E
Eric Dumazet 已提交
1719 1720
		 * allow to send ICMP, so that packets will disappear
		 * to blackhole.
L
Linus Torvalds 已提交
1721
		 */
1722
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
1723 1724 1725 1726
		ip_rt_put(rt);
		goto out_free;
	}

1727
	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
L
Linus Torvalds 已提交
1728 1729

	if (skb_cow(skb, encap)) {
1730
		ip_rt_put(rt);
L
Linus Torvalds 已提交
1731 1732 1733 1734
		goto out_free;
	}

	vif->pkt_out++;
J
Jianjun Kong 已提交
1735
	vif->bytes_out += skb->len;
L
Linus Torvalds 已提交
1736

E
Eric Dumazet 已提交
1737
	skb_dst_drop(skb);
1738
	skb_dst_set(skb, &rt->dst);
1739
	ip_decrease_ttl(ip_hdr(skb));
L
Linus Torvalds 已提交
1740 1741

	/* FIXME: forward and output firewalls used to be called here.
E
Eric Dumazet 已提交
1742 1743
	 * What do we do with netfilter? -- RR
	 */
L
Linus Torvalds 已提交
1744
	if (vif->flags & VIFF_TUNNEL) {
1745
		ip_encap(net, skb, vif->local, vif->remote);
L
Linus Torvalds 已提交
1746
		/* FIXME: extra output firewall step used to be here. --RR */
1747 1748
		vif->dev->stats.tx_packets++;
		vif->dev->stats.tx_bytes += skb->len;
L
Linus Torvalds 已提交
1749 1750 1751 1752
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

1753
	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
L
Linus Torvalds 已提交
1754 1755 1756 1757 1758 1759 1760 1761 1762
	 * not only before forwarding, but after forwarding on all output
	 * interfaces. It is clear, if mrouter runs a multicasting
	 * program, it should receive packets not depending to what interface
	 * program is joined.
	 * If we will not make it, the program will have to join on all
	 * interfaces. On the other hand, multihoming host (or router, but
	 * not mrouter) cannot join to more than one interface - it will
	 * result in receiving multiple packets.
	 */
1763 1764
	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
		net, NULL, skb, skb->dev, dev,
L
Linus Torvalds 已提交
1765 1766 1767 1768 1769 1770 1771
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
}

1772
static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
L
Linus Torvalds 已提交
1773 1774
{
	int ct;
1775 1776 1777

	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
		if (mrt->vif_table[ct].dev == dev)
L
Linus Torvalds 已提交
1778 1779 1780 1781 1782 1783
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */
1784 1785 1786
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
			  struct sk_buff *skb, struct mfc_cache *cache,
			  int local)
L
Linus Torvalds 已提交
1787 1788 1789
{
	int psend = -1;
	int vif, ct;
1790
	int true_vifi = ipmr_find_vif(mrt, skb->dev);
L
Linus Torvalds 已提交
1791 1792 1793 1794 1795

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

1796
	if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807
		struct mfc_cache *cache_proxy;

		/* For an (*,G) entry, we only check that the incomming
		 * interface is part of the static tree.
		 */
		cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
		if (cache_proxy &&
		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
			goto forward;
	}

1808
	/* Wrong interface: drop packet and (maybe) send PIM assert. */
1809
	if (mrt->vif_table[vif].dev != skb->dev) {
1810
		if (rt_is_output_route(skb_rtable(skb))) {
L
Linus Torvalds 已提交
1811
			/* It is our own packet, looped back.
E
Eric Dumazet 已提交
1812 1813 1814 1815 1816 1817 1818 1819 1820
			 * Very complicated situation...
			 *
			 * The best workaround until routing daemons will be
			 * fixed is not to redistribute packet, if it was
			 * send through wrong interface. It means, that
			 * multicast applications WILL NOT work for
			 * (S,G), which have default multicast route pointing
			 * to wrong oif. In any case, it is not a good
			 * idea to use multicasting applications on router.
L
Linus Torvalds 已提交
1821 1822 1823 1824 1825 1826
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;

1827
		if (true_vifi >= 0 && mrt->mroute_do_assert &&
L
Linus Torvalds 已提交
1828
		    /* pimsm uses asserts, when switching from RPT to SPT,
E
Eric Dumazet 已提交
1829 1830 1831
		     * so that we cannot check that packet arrived on an oif.
		     * It is bad, but otherwise we would need to move pretty
		     * large chunk of pimd to kernel. Ough... --ANK
L
Linus Torvalds 已提交
1832
		     */
1833
		    (mrt->mroute_do_pim ||
1834
		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
1835
		    time_after(jiffies,
L
Linus Torvalds 已提交
1836 1837
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
1838
			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
L
Linus Torvalds 已提交
1839 1840 1841 1842
		}
		goto dont_forward;
	}

1843
forward:
1844 1845
	mrt->vif_table[vif].pkt_in++;
	mrt->vif_table[vif].bytes_in += skb->len;
L
Linus Torvalds 已提交
1846

1847
	/* Forward the frame */
1848 1849
	if (cache->mfc_origin == htonl(INADDR_ANY) &&
	    cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862
		if (true_vifi >= 0 &&
		    true_vifi != cache->mfc_parent &&
		    ip_hdr(skb)->ttl >
				cache->mfc_un.res.ttls[cache->mfc_parent]) {
			/* It's an (*,*) entry and the packet is not coming from
			 * the upstream: forward the packet to the upstream
			 * only.
			 */
			psend = cache->mfc_parent;
			goto last_forward;
		}
		goto dont_forward;
	}
E
Eric Dumazet 已提交
1863 1864
	for (ct = cache->mfc_un.res.maxvif - 1;
	     ct >= cache->mfc_un.res.minvif; ct--) {
1865
		/* For (*,G) entry, don't forward to the incoming interface */
1866 1867
		if ((cache->mfc_origin != htonl(INADDR_ANY) ||
		     ct != true_vifi) &&
1868
		    ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
L
Linus Torvalds 已提交
1869 1870
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
E
Eric Dumazet 已提交
1871

L
Linus Torvalds 已提交
1872
				if (skb2)
1873 1874
					ipmr_queue_xmit(net, mrt, skb2, cache,
							psend);
L
Linus Torvalds 已提交
1875
			}
J
Jianjun Kong 已提交
1876
			psend = ct;
L
Linus Torvalds 已提交
1877 1878
		}
	}
1879
last_forward:
L
Linus Torvalds 已提交
1880 1881 1882
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
E
Eric Dumazet 已提交
1883

L
Linus Torvalds 已提交
1884
			if (skb2)
1885
				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
L
Linus Torvalds 已提交
1886
		} else {
1887
			ipmr_queue_xmit(net, mrt, skb, cache, psend);
1888
			return;
L
Linus Torvalds 已提交
1889 1890 1891 1892 1893 1894 1895 1896
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
}

1897
static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
1898
{
1899 1900
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph = ip_hdr(skb);
D
David S. Miller 已提交
1901
	struct flowi4 fl4 = {
1902 1903
		.daddr = iph->daddr,
		.saddr = iph->saddr,
1904
		.flowi4_tos = RT_TOS(iph->tos),
D
David S. Miller 已提交
1905 1906 1907
		.flowi4_oif = (rt_is_output_route(rt) ?
			       skb->dev->ifindex : 0),
		.flowi4_iif = (rt_is_output_route(rt) ?
1908
			       LOOPBACK_IFINDEX :
D
David S. Miller 已提交
1909
			       skb->dev->ifindex),
1910
		.flowi4_mark = skb->mark,
1911 1912 1913 1914
	};
	struct mr_table *mrt;
	int err;

D
David S. Miller 已提交
1915
	err = ipmr_fib_lookup(net, &fl4, &mrt);
1916 1917 1918 1919
	if (err)
		return ERR_PTR(err);
	return mrt;
}
L
Linus Torvalds 已提交
1920

1921 1922
/* Multicast packets for forwarding arrive here
 * Called with rcu_read_lock();
L
Linus Torvalds 已提交
1923 1924 1925 1926
 */
int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
1927
	struct net *net = dev_net(skb->dev);
E
Eric Dumazet 已提交
1928
	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1929
	struct mr_table *mrt;
L
Linus Torvalds 已提交
1930 1931

	/* Packet is looped back after forward, it should not be
E
Eric Dumazet 已提交
1932
	 * forwarded second time, but still can be delivered locally.
L
Linus Torvalds 已提交
1933
	 */
E
Eric Dumazet 已提交
1934
	if (IPCB(skb)->flags & IPSKB_FORWARDED)
L
Linus Torvalds 已提交
1935 1936
		goto dont_forward;

1937
	mrt = ipmr_rt_fib_lookup(net, skb);
1938 1939 1940
	if (IS_ERR(mrt)) {
		kfree_skb(skb);
		return PTR_ERR(mrt);
1941
	}
L
Linus Torvalds 已提交
1942
	if (!local) {
E
Eric Dumazet 已提交
1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960
		if (IPCB(skb)->opt.router_alert) {
			if (ip_call_ra_chain(skb))
				return 0;
		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
			/* IGMPv1 (and broken IGMPv2 implementations sort of
			 * Cisco IOS <= 11.2(8)) do not put router alert
			 * option to IGMP packets destined to routable
			 * groups. It is very bad, because it means
			 * that we can forward NO IGMP messages.
			 */
			struct sock *mroute_sk;

			mroute_sk = rcu_dereference(mrt->mroute_sk);
			if (mroute_sk) {
				nf_reset(skb);
				raw_rcv(mroute_sk, skb);
				return 0;
			}
L
Linus Torvalds 已提交
1961 1962 1963
		    }
	}

1964
	/* already under rcu_read_lock() */
1965
	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1966
	if (!cache) {
1967 1968 1969 1970 1971 1972
		int vif = ipmr_find_vif(mrt, skb->dev);

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
						    vif);
	}
L
Linus Torvalds 已提交
1973

1974
	/* No usable cache entry */
1975
	if (!cache) {
L
Linus Torvalds 已提交
1976 1977 1978 1979 1980
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
1981
			if (!skb2)
L
Linus Torvalds 已提交
1982 1983 1984 1985
				return -ENOBUFS;
			skb = skb2;
		}

1986
		read_lock(&mrt_lock);
1987
		vif = ipmr_find_vif(mrt, skb->dev);
L
Linus Torvalds 已提交
1988
		if (vif >= 0) {
1989
			int err2 = ipmr_cache_unresolved(mrt, vif, skb);
L
Linus Torvalds 已提交
1990 1991
			read_unlock(&mrt_lock);

1992
			return err2;
L
Linus Torvalds 已提交
1993 1994 1995 1996 1997 1998
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

1999
	read_lock(&mrt_lock);
2000
	ip_mr_forward(net, mrt, skb, cache, local);
L
Linus Torvalds 已提交
2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013 2014
	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

I
Ilpo Järvinen 已提交
2015
#ifdef CONFIG_IP_PIMSM_V1
2016
/* Handle IGMP messages of PIMv1 */
E
Eric Dumazet 已提交
2017
int pim_rcv_v1(struct sk_buff *skb)
I
Ilpo Järvinen 已提交
2018 2019
{
	struct igmphdr *pim;
2020
	struct net *net = dev_net(skb->dev);
2021
	struct mr_table *mrt;
I
Ilpo Järvinen 已提交
2022 2023 2024 2025 2026 2027

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = igmp_hdr(skb);

2028
	mrt = ipmr_rt_fib_lookup(net, skb);
2029 2030
	if (IS_ERR(mrt))
		goto drop;
2031
	if (!mrt->mroute_do_pim ||
I
Ilpo Järvinen 已提交
2032 2033 2034
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

2035
	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
I
Ilpo Järvinen 已提交
2036 2037 2038
drop:
		kfree_skb(skb);
	}
L
Linus Torvalds 已提交
2039 2040 2041 2042 2043
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
E
Eric Dumazet 已提交
2044
static int pim_rcv(struct sk_buff *skb)
L
Linus Torvalds 已提交
2045 2046
{
	struct pimreghdr *pim;
2047 2048
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;
L
Linus Torvalds 已提交
2049

I
Ilpo Järvinen 已提交
2050
	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
L
Linus Torvalds 已提交
2051 2052
		goto drop;

2053
	pim = (struct pimreghdr *)skb_transport_header(skb);
E
Eric Dumazet 已提交
2054 2055
	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
2056
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
2057
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
L
Linus Torvalds 已提交
2058 2059
		goto drop;

2060
	mrt = ipmr_rt_fib_lookup(net, skb);
2061 2062
	if (IS_ERR(mrt))
		goto drop;
2063
	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
I
Ilpo Järvinen 已提交
2064 2065 2066
drop:
		kfree_skb(skb);
	}
L
Linus Torvalds 已提交
2067 2068 2069 2070
	return 0;
}
#endif

2071 2072
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			      struct mfc_cache *c, struct rtmsg *rtm)
L
Linus Torvalds 已提交
2073 2074 2075
{
	int ct;
	struct rtnexthop *nhp;
T
Thomas Graf 已提交
2076
	struct nlattr *mp_attr;
2077
	struct rta_mfc_stats mfcs;
L
Linus Torvalds 已提交
2078

2079
	/* If cache is unresolved, don't try to parse IIF and OIF */
2080
	if (c->mfc_parent >= MAXVIFS)
2081 2082
		return -ENOENT;

T
Thomas Graf 已提交
2083 2084 2085
	if (VIF_EXISTS(mrt, c->mfc_parent) &&
	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
		return -EMSGSIZE;
L
Linus Torvalds 已提交
2086

T
Thomas Graf 已提交
2087 2088
	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
		return -EMSGSIZE;
L
Linus Torvalds 已提交
2089 2090

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
2091
		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
T
Thomas Graf 已提交
2092 2093 2094 2095 2096
			if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
				nla_nest_cancel(skb, mp_attr);
				return -EMSGSIZE;
			}

L
Linus Torvalds 已提交
2097 2098
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
2099
			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
L
Linus Torvalds 已提交
2100 2101 2102
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
T
Thomas Graf 已提交
2103 2104 2105

	nla_nest_end(skb, mp_attr);

2106 2107 2108
	mfcs.mfcs_packets = c->mfc_un.res.pkt;
	mfcs.mfcs_bytes = c->mfc_un.res.bytes;
	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
2109
	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) < 0)
2110 2111
		return -EMSGSIZE;

L
Linus Torvalds 已提交
2112 2113 2114 2115
	rtm->rtm_type = RTN_MULTICAST;
	return 1;
}

2116 2117 2118
/* Fill @skb/@rtm with the multicast route for (@saddr, @daddr) taken from
 * the default table.
 *
 * When an exact or wildcard cache entry exists, its attributes are written
 * through __ipmr_fill_mroute() and that result is returned. Otherwise, if
 * @nowait is set, -EAGAIN is returned; if not, a clone of @skb carrying a
 * minimal IP header with version 0 is queued via ipmr_cache_unresolved().
 *
 * Returns -ENOENT without a default table, -ENODEV when skb->dev is not a
 * vif, -ENOMEM when the clone fails.
 */
int ipmr_get_route(struct net *net, struct sk_buff *skb,
		   __be32 saddr, __be32 daddr,
		   struct rtmsg *rtm, int nowait)
{
	struct mfc_cache *cache;
	struct net_device *dev;
	struct sk_buff *skb2;
	struct mr_table *mrt;
	struct iphdr *iph;
	int vif;
	int err;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	rcu_read_lock();

	cache = ipmr_cache_find(mrt, saddr, daddr);
	if (!cache && skb->dev) {
		vif = ipmr_find_vif(mrt, skb->dev);
		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, daddr, vif);
	}

	if (cache) {
		read_lock(&mrt_lock);
		err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
		read_unlock(&mrt_lock);
		goto out_rcu;
	}

	if (nowait) {
		err = -EAGAIN;
		goto out_rcu;
	}

	dev = skb->dev;
	read_lock(&mrt_lock);

	vif = dev ? ipmr_find_vif(mrt, dev) : -1;
	if (vif < 0) {
		err = -ENODEV;
		goto out_unlock;
	}

	skb2 = skb_clone(skb, GFP_ATOMIC);
	if (!skb2) {
		err = -ENOMEM;
		goto out_unlock;
	}

	/* Prepend a pseudo IP header; version 0 marks it as not a real packet */
	skb_push(skb2, sizeof(struct iphdr));
	skb_reset_network_header(skb2);
	iph = ip_hdr(skb2);
	iph->ihl = sizeof(struct iphdr) >> 2;
	iph->saddr = saddr;
	iph->daddr = daddr;
	iph->version = 0;
	err = ipmr_cache_unresolved(mrt, vif, skb2);

out_unlock:
	read_unlock(&mrt_lock);
out_rcu:
	rcu_read_unlock();
	return err;
}

2183
/* Build one complete rtnetlink route message for cache entry @c in @skb.
 *
 * The rtmsg header is filled in, followed by RTA_TABLE, RTA_SRC, RTA_DST
 * and whatever __ipmr_fill_mroute() adds. An unresolved entry (-ENOENT
 * from __ipmr_fill_mroute()) still yields a valid message.
 *
 * Returns 0 on success; on failure the partial message is cancelled and
 * -EMSGSIZE is returned.
 */
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			    u32 portid, u32 seq, struct mfc_cache *c, int cmd,
			    int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	int err;

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = RTNL_FAMILY_IPMR;
	rtm->rtm_dst_len = 32;
	rtm->rtm_src_len = 32;
	rtm->rtm_tos = 0;
	rtm->rtm_table = mrt->id;
	rtm->rtm_type = RTN_MULTICAST;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = (c->mfc_flags & MFC_STATIC) ? RTPROT_STATIC :
							  RTPROT_MROUTED;
	rtm->rtm_flags = 0;

	if (nla_put_u32(skb, RTA_TABLE, mrt->id))
		goto nla_put_failure;

	if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
		goto nla_put_failure;

	err = __ipmr_fill_mroute(mrt, skb, c, rtm);
	/* do not break the dump if cache is unresolved */
	if (err < 0 && err != -ENOENT)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241
/* Payload size needed for one mroute notification message.
 *
 * Every message carries the rtmsg header plus RTA_TABLE, RTA_SRC and
 * RTA_DST. A resolved entry additionally needs room for RTA_IIF, the
 * RTA_MULTIPATH nest with up to @maxvif nexthops, and RTA_MFC_STATS.
 */
static size_t mroute_msgsize(bool unresolved, int maxvif)
{
	size_t len = NLMSG_ALIGN(sizeof(struct rtmsg));

	len += nla_total_size(4);	/* RTA_TABLE */
	len += nla_total_size(4);	/* RTA_SRC */
	len += nla_total_size(4);	/* RTA_DST */

	if (unresolved)
		return len;

	len += nla_total_size(4);	/* RTA_IIF */
	len += nla_total_size(0);	/* RTA_MULTIPATH */
	len += maxvif * NLA_ALIGN(sizeof(struct rtnexthop));
	/* RTA_MFC_STATS */
	len += nla_total_size_64bit(sizeof(struct rta_mfc_stats));

	return len;
}

static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
				 int cmd)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
			GFP_ATOMIC);
2257
	if (!skb)
2258 2259
		goto errout;

2260
	err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272
	if (err < 0)
		goto errout;

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
	return;

errout:
	kfree_skb(skb);
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}

2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct mr_table *mrt;
	struct mfc_cache *mfc;
	unsigned int t = 0, s_t;
	unsigned int h = 0, s_h;
	unsigned int e = 0, s_e;

	s_t = cb->args[0];
	s_h = cb->args[1];
	s_e = cb->args[2];

2286
	rcu_read_lock();
2287 2288 2289 2290 2291 2292
	ipmr_for_each_table(mrt, net) {
		if (t < s_t)
			goto next_table;
		if (t > s_t)
			s_h = 0;
		for (h = s_h; h < MFC_LINES; h++) {
2293
			list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) {
2294 2295 2296
				if (e < s_e)
					goto next_entry;
				if (ipmr_fill_mroute(mrt, skb,
2297
						     NETLINK_CB(cb->skb).portid,
2298
						     cb->nlh->nlmsg_seq,
2299 2300
						     mfc, RTM_NEWROUTE,
						     NLM_F_MULTI) < 0)
2301 2302 2303 2304 2305 2306
					goto done;
next_entry:
				e++;
			}
			e = s_e = 0;
		}
2307 2308 2309 2310 2311 2312 2313
		spin_lock_bh(&mfc_unres_lock);
		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
			if (e < s_e)
				goto next_entry2;
			if (ipmr_fill_mroute(mrt, skb,
					     NETLINK_CB(cb->skb).portid,
					     cb->nlh->nlmsg_seq,
2314 2315
					     mfc, RTM_NEWROUTE,
					     NLM_F_MULTI) < 0) {
2316 2317 2318 2319 2320 2321 2322 2323
				spin_unlock_bh(&mfc_unres_lock);
				goto done;
			}
next_entry2:
			e++;
		}
		spin_unlock_bh(&mfc_unres_lock);
		e = s_e = 0;
2324 2325 2326 2327 2328
		s_h = 0;
next_table:
		t++;
	}
done:
2329
	rcu_read_unlock();
2330 2331 2332 2333 2334 2335 2336 2337

	cb->args[2] = e;
	cb->args[1] = h;
	cb->args[0] = t;

	return skb->len;
}

2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461
/* Attribute validation policy for RTM_NEWROUTE/RTM_DELROUTE requests on
 * RTNL_FAMILY_IPMR; used by rtm_to_ipmr_mfcc().
 */
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
	[RTA_SRC]       = { .type = NLA_U32 },
	[RTA_DST]       = { .type = NLA_U32 },
	[RTA_IIF]       = { .type = NLA_U32 },
	[RTA_TABLE]     = { .type = NLA_U32 },
	/* must hold at least one struct rtnexthop */
	[RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
};

/* Only RTPROT_STATIC and RTPROT_MROUTED are accepted as route protocol. */
static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
{
	return rtm_protocol == RTPROT_STATIC ||
	       rtm_protocol == RTPROT_MROUTED;
}

/* Copy the per-vif TTL thresholds out of an RTA_MULTIPATH attribute.
 *
 * The attribute payload is a sequence of struct rtnexthop; the rtnh_hops
 * of the i-th nexthop becomes mfcc->mfcc_ttls[i].
 *
 * Returns the number of nexthops consumed, or -EINVAL when the payload
 * holds more than MAXVIFS nexthops or has trailing bytes that do not form
 * a valid nexthop.
 */
static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
{
	struct rtnexthop *rtnh = nla_data(nla);
	int remaining = nla_len(nla), vifi = 0;

	while (rtnh_ok(rtnh, remaining)) {
		/* Another nexthop but no slot left for it: too many */
		if (vifi == MAXVIFS)
			return -EINVAL;

		mfcc->mfcc_ttls[vifi++] = rtnh->rtnh_hops;

		/* Always account for the nexthop just consumed, so that a
		 * list of exactly MAXVIFS entries ends with remaining == 0
		 * and is accepted.
		 */
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return remaining > 0 ? -EINVAL : vifi;
}

/* Translate an rtnetlink route request into a struct mfcctl.
 *
 * On success *mrtret is the table named by RTA_TABLE (default table when
 * absent), *mrtsock is 1 for RTPROT_MROUTED and 0 otherwise, and
 * mfcc->mfcc_parent is the vif of the RTA_IIF device when one was given.
 *
 * returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY
 * (the latter is selected by the presence of RTA_PREFSRC)
 */
static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
			    struct mfcctl *mfcc, int *mrtsock,
			    struct mr_table **mrtret)
{
	struct net_device *dev = NULL;
	u32 tblid = RT_TABLE_DEFAULT;
	struct mr_table *mrt;
	struct nlattr *attr;
	struct rtmsg *rtm;
	int ret, rem;

	ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy);
	if (ret < 0)
		return ret;

	rtm = nlmsg_data(nlh);
	if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
	    rtm->rtm_type != RTN_MULTICAST ||
	    rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
	    !ipmr_rtm_validate_proto(rtm->rtm_protocol))
		return -EINVAL;

	memset(mfcc, 0, sizeof(*mfcc));
	mfcc->mfcc_parent = -1;
	ret = 0;

	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
		switch (nla_type(attr)) {
		case RTA_SRC:
			mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
			break;
		case RTA_DST:
			mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
			break;
		case RTA_IIF:
			dev = __dev_get_by_index(net, nla_get_u32(attr));
			if (!dev)
				return -ENODEV;
			break;
		case RTA_MULTIPATH:
			if (ipmr_nla_get_ttls(attr, mfcc) < 0)
				return -EINVAL;
			break;
		case RTA_PREFSRC:
			ret = 1;
			break;
		case RTA_TABLE:
			tblid = nla_get_u32(attr);
			break;
		default:
			/* other attributes are ignored */
			break;
		}
	}

	mrt = ipmr_get_table(net, tblid);
	if (!mrt)
		return -ENOENT;

	*mrtret = mrt;
	*mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
	if (dev)
		mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);

	return ret;
}

/* rtnetlink doit handler shared by RTM_NEWROUTE and RTM_DELROUTE.
 *
 * The request is parsed by rtm_to_ipmr_mfcc(); a proxy request (return
 * value 1) passes the parsed parent vif on, a plain one passes -1.
 */
static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh)
{
	struct net *net = sock_net(skb->sk);
	struct mr_table *tbl = NULL;
	struct mfcctl mfcc;
	int mrtsock = 0;
	int parent;
	int ret;

	ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl);
	if (ret < 0)
		return ret;

	parent = ret ? mfcc.mfcc_parent : -1;

	if (nlh->nlmsg_type != RTM_NEWROUTE)
		return ipmr_mfc_delete(tbl, &mfcc, parent);

	return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
}

2462
#ifdef CONFIG_PROC_FS
2463 2464
/* The /proc interfaces to multicast routing :
 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
L
Linus Torvalds 已提交
2465 2466
 */
struct ipmr_vif_iter {
2467
	struct seq_net_private p;
2468
	struct mr_table *mrt;
L
Linus Torvalds 已提交
2469 2470 2471
	int ct;
};

2472 2473
static struct vif_device *ipmr_vif_seq_idx(struct net *net,
					   struct ipmr_vif_iter *iter,
L
Linus Torvalds 已提交
2474 2475
					   loff_t pos)
{
2476
	struct mr_table *mrt = iter->mrt;
2477 2478 2479

	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
		if (!VIF_EXISTS(mrt, iter->ct))
L
Linus Torvalds 已提交
2480
			continue;
2481
		if (pos-- == 0)
2482
			return &mrt->vif_table[iter->ct];
L
Linus Torvalds 已提交
2483 2484 2485 2486 2487
	}
	return NULL;
}

/* seq_file ->start() for /proc/net/ip_mr_vif.
 *
 * Takes mrt_lock for reading and positions the iterator on the default
 * table. Returns SEQ_START_TOKEN for position 0, the vif at *pos - 1
 * otherwise (NULL past the end), or ERR_PTR(-ENOENT) when there is no
 * default table.
 *
 * The lock is taken before anything can fail: the seq_file core calls
 * ->stop() after ->start() even when ->start() returned an error, and
 * ipmr_vif_seq_stop() unlocks unconditionally. Returning the error with
 * the lock held keeps the lock/unlock pair balanced.
 */
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	struct ipmr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	read_lock(&mrt_lock);

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	iter->mrt = mrt;

	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

/* seq_file ->next() for /proc/net/ip_mr_vif: advance to the next existing
 * vif after iter->ct, or to the first one when coming from the header
 * token. Returns NULL at the end of the table.
 */
static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = iter->mrt;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(net, iter, 0);

	for (iter->ct++; iter->ct < mrt->maxvif; iter->ct++) {
		if (VIF_EXISTS(mrt, iter->ct))
			return &mrt->vif_table[iter->ct];
	}

	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
S
Stephen Hemminger 已提交
2524
	__releases(mrt_lock)
L
Linus Torvalds 已提交
2525 2526 2527 2528 2529 2530
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
2531 2532
	struct ipmr_vif_iter *iter = seq->private;
	struct mr_table *mrt = iter->mrt;
2533

L
Linus Torvalds 已提交
2534
	if (v == SEQ_START_TOKEN) {
2535
		seq_puts(seq,
L
Linus Torvalds 已提交
2536 2537 2538 2539 2540 2541 2542
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name =  vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2543
			   vif - mrt->vif_table,
2544
			   name, vif->bytes_in, vif->pkt_in,
L
Linus Torvalds 已提交
2545 2546 2547 2548 2549 2550
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

2551
static const struct seq_operations ipmr_vif_seq_ops = {
L
Linus Torvalds 已提交
2552 2553 2554 2555 2556 2557 2558 2559
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
2560 2561
	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
			    sizeof(struct ipmr_vif_iter));
L
Linus Torvalds 已提交
2562 2563
}

2564
static const struct file_operations ipmr_vif_fops = {
L
Linus Torvalds 已提交
2565 2566 2567 2568
	.owner	 = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
2569
	.release = seq_release_net,
L
Linus Torvalds 已提交
2570 2571 2572
};

struct ipmr_mfc_iter {
2573
	struct seq_net_private p;
2574
	struct mr_table *mrt;
2575
	struct list_head *cache;
L
Linus Torvalds 已提交
2576 2577 2578 2579
	int ct;
};


2580 2581
/* Return the @pos-th cache entry of it->mrt, counting resolved entries
 * (hash lines in order) first and unresolved ones afterwards.
 *
 * Locking on return depends on where the entry was found:
 *  - in a hash line: rcu_read_lock() is still held;
 *  - in the unresolved queue: mfc_unres_lock is still held;
 *  - not found: nothing is held and it->cache is NULL.
 * it->cache records which list the entry belongs to.
 */
static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
					  struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mr_table *mrt = it->mrt;
	struct mfc_cache *mfc;

	rcu_read_lock();
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
		it->cache = &mrt->mfc_cache_array[it->ct];
		list_for_each_entry_rcu(mfc, it->cache, list) {
			if (pos == 0)
				return mfc;
			pos--;
		}
	}
	rcu_read_unlock();

	spin_lock_bh(&mfc_unres_lock);
	it->cache = &mrt->mfc_unres_queue;
	list_for_each_entry(mfc, it->cache, list) {
		if (pos == 0)
			return mfc;
		pos--;
	}
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}


/* seq_file ->start() for /proc/net/ip_mr_cache.
 *
 * Resets the iterator onto the default table and returns SEQ_START_TOKEN
 * for position 0, or the entry at *pos - 1 otherwise. Returns
 * ERR_PTR(-ENOENT) when there is no default table.
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);

	if (!mrt)
		return ERR_PTR(-ENOENT);

	it->mrt = mrt;
	it->cache = NULL;
	it->ct = 0;

	if (!*pos)
		return SEQ_START_TOKEN;

	return ipmr_mfc_seq_idx(net, seq->private, *pos - 1);
}

/* seq_file ->next() for /proc/net/ip_mr_cache.
 *
 * Steps to the following entry of the current list; when a hash line is
 * exhausted it moves to the next non-empty line, and after the last line
 * it drops the RCU read lock and switches to the unresolved queue under
 * mfc_unres_lock. Returns NULL, with mfc_unres_lock released and
 * it->cache cleared, once the unresolved queue is exhausted.
 */
static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = it->mrt;
	struct mfc_cache *mfc = v;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(net, seq->private, 0);

	/* More entries on the list we are currently on? */
	if (mfc->list.next != it->cache)
		return list_entry(mfc->list.next, struct mfc_cache, list);

	if (it->cache == &mrt->mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);

	for (it->ct++; it->ct < MFC_LINES; it->ct++) {
		it->cache = &mrt->mfc_cache_array[it->ct];
		if (!list_empty(it->cache))
			return list_first_entry(it->cache, struct mfc_cache,
						list);
	}

	/* exhausted cache_array, show unresolved */
	rcu_read_unlock();
	it->cache = &mrt->mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	if (!list_empty(it->cache))
		return list_first_entry(it->cache, struct mfc_cache, list);

end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;
2670
	struct mr_table *mrt = it->mrt;
L
Linus Torvalds 已提交
2671

2672
	if (it->cache == &mrt->mfc_unres_queue)
L
Linus Torvalds 已提交
2673
		spin_unlock_bh(&mfc_unres_lock);
2674
	else if (it->cache == &mrt->mfc_cache_array[it->ct])
2675
		rcu_read_unlock();
L
Linus Torvalds 已提交
2676 2677 2678 2679 2680 2681 2682
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
2683
		seq_puts(seq,
L
Linus Torvalds 已提交
2684 2685 2686 2687
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;
2688
		const struct mr_table *mrt = it->mrt;
2689

2690 2691 2692
		seq_printf(seq, "%08X %08X %-3hd",
			   (__force u32) mfc->mfc_mcastgrp,
			   (__force u32) mfc->mfc_origin,
2693
			   mfc->mfc_parent);
L
Linus Torvalds 已提交
2694

2695
		if (it->cache != &mrt->mfc_unres_queue) {
2696 2697 2698 2699
			seq_printf(seq, " %8lu %8lu %8lu",
				   mfc->mfc_un.res.pkt,
				   mfc->mfc_un.res.bytes,
				   mfc->mfc_un.res.wrong_if);
S
Stephen Hemminger 已提交
2700
			for (n = mfc->mfc_un.res.minvif;
E
Eric Dumazet 已提交
2701
			     n < mfc->mfc_un.res.maxvif; n++) {
2702
				if (VIF_EXISTS(mrt, n) &&
2703 2704
				    mfc->mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
2705
					   " %2d:%-3d",
L
Linus Torvalds 已提交
2706 2707
					   n, mfc->mfc_un.res.ttls[n]);
			}
2708 2709 2710 2711 2712
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
L
Linus Torvalds 已提交
2713 2714 2715 2716 2717 2718
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

2719
static const struct seq_operations ipmr_mfc_seq_ops = {
L
Linus Torvalds 已提交
2720 2721 2722 2723 2724 2725 2726 2727
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
2728 2729
	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
			    sizeof(struct ipmr_mfc_iter));
L
Linus Torvalds 已提交
2730 2731
}

2732
static const struct file_operations ipmr_mfc_fops = {
L
Linus Torvalds 已提交
2733 2734 2735 2736
	.owner	 = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
2737
	.release = seq_release_net,
L
Linus Torvalds 已提交
2738
};
2739
#endif
L
Linus Torvalds 已提交
2740 2741

#ifdef CONFIG_IP_PIMSM_V2
2742
static const struct net_protocol pim_protocol = {
L
Linus Torvalds 已提交
2743
	.handler	=	pim_rcv,
T
Tom Goff 已提交
2744
	.netns_ok	=	1,
L
Linus Torvalds 已提交
2745 2746 2747
};
#endif

2748
/* Setup for IP multicast routing */
2749 2750
/* Per-namespace initialisation: set up the routing rules/tables and, with
 * CONFIG_PROC_FS, create /proc/net/ip_mr_vif and /proc/net/ip_mr_cache.
 * Everything already set up is undone when a later step fails.
 *
 * Returns 0 on success, the ipmr_rules_init() error, or -ENOMEM when a
 * proc entry cannot be created.
 */
static int __net_init ipmr_net_init(struct net *net)
{
	int err = ipmr_rules_init(net);

	if (err < 0)
		return err;

#ifdef CONFIG_PROC_FS
	if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops)) {
		err = -ENOMEM;
		goto proc_vif_fail;
	}
	if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops)) {
		err = -ENOMEM;
		goto proc_cache_fail;
	}
#endif
	return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
	remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
	ipmr_rules_exit(net);
	return err;
#endif
}

/* Per-namespace teardown: remove the proc entries (when built with
 * CONFIG_PROC_FS) and then release the rules/tables.
 */
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *dir = net->proc_net;

	remove_proc_entry("ip_mr_cache", dir);
	remove_proc_entry("ip_mr_vif", dir);
#endif
	ipmr_rules_exit(net);
}

/* Per-network-namespace init/exit hooks, registered in ip_mr_init() */
static struct pernet_operations ipmr_net_ops = {
	.init	= ipmr_net_init,
	.exit	= ipmr_net_exit,
};
2789

W
Wang Chen 已提交
2790
/* Boot-time setup of IPv4 multicast routing.
 *
 * Creates the mfc_cache slab, registers the per-namespace operations and
 * the netdevice notifier, adds the PIM protocol handler (with
 * CONFIG_IP_PIMSM_V2) and registers the rtnetlink handlers for
 * RTNL_FAMILY_IPMR. Earlier steps are unwound in reverse order when a
 * later one fails.
 *
 * Returns 0 on success or the error of the failing step (-EAGAIN when the
 * PIM protocol cannot be added).
 */
int __init ip_mr_init(void)
{
	int err;

	/* SLAB_PANIC: creation failure panics instead of returning NULL */
	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache), 0,
				       SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	err = register_pernet_subsys(&ipmr_net_ops);
	if (err)
		goto reg_pernet_fail;

	err = register_netdevice_notifier(&ip_mr_notifier);
	if (err)
		goto reg_notif_fail;

#ifdef CONFIG_IP_PIMSM_V2
	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
		pr_err("%s: can't add PIM protocol\n", __func__);
		err = -EAGAIN;
		goto add_proto_fail;
	}
#endif

	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
		      NULL, ipmr_rtm_dumproute, NULL);
	rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
		      ipmr_rtm_route, NULL, NULL);
	rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
		      ipmr_rtm_route, NULL, NULL);

	return 0;

#ifdef CONFIG_IP_PIMSM_V2
add_proto_fail:
	unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
	unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);

	return err;
}