/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <linux/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/compat.h>
#include <linux/export.h>
#include <net/ip_tunnels.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
#include <net/nexthop.h>

struct ipmr_rule {
	struct fib_rule		common;
};

struct ipmr_result {
	struct mr_table		*mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
 * Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/* Multicast router control variables */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to original Alan's scheme. Hash table of resolved
 * entries is changed only in process context and protected
 * with weak lock mrt_lock. Queue of unresolved entries is protected
 * with strong spinlock mfc_unres_lock.
 *
 * In this case data path is free of exclusive locks at all.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static void ipmr_free_table(struct mr_table *mrt);

static void ip_mr_forward(struct net *net, struct mr_table *mrt,
			  struct net_device *dev, struct sk_buff *skb,
			  struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct mr_table *mrt,
			     struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			      struct mfc_cache *c, struct rtmsg *rtm);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
				 int cmd);
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	ipmr_for_each_table(mrt, net) {
		if (mrt->id == id)
			return mrt;
	}
	return NULL;
}

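/* Pick the mr_table for a flow by running the RTNL_FAMILY_IPMR fib rules;
 * on a match, ipmr_rule_action() stores the chosen table in res.mrt.
 */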
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
			   struct mr_table **mrt)
{
	int err;
	struct ipmr_result res;
	struct fib_lookup_arg arg = {
		.result = &res,
		.flags = FIB_LOOKUP_NOREF,
	};

	/* update flow if oif or iif point to device enslaved to l3mdev */
	l3mdev_update_flow(net, flowi4_to_flowi(flp4));

	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
			       flowi4_to_flowi(flp4), 0, &arg);
	if (err < 0)
		return err;
	*mrt = res.mrt;
	return 0;
}

static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
			    int flags, struct fib_lookup_arg *arg)
{
	struct ipmr_result *res = arg->result;
	struct mr_table *mrt;

	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

	arg->table = fib_rule_get_table(rule, arg);

	mrt = ipmr_get_table(rule->fr_net, arg->table);
	if (!mrt)
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
	return 1;
}

static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
	FRA_GENERIC_POLICY,
};

static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct fib_rule_hdr *frh, struct nlattr **tb)
{
	return 0;
}

static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	return 1;
}

static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			  struct fib_rule_hdr *frh)
{
	frh->dst_len = 0;
	frh->src_len = 0;
	frh->tos     = 0;
	return 0;
}

static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
	.family		= RTNL_FAMILY_IPMR,
	.rule_size	= sizeof(struct ipmr_rule),
	.addr_size	= sizeof(u32),
	.action		= ipmr_rule_action,
	.match		= ipmr_rule_match,
	.configure	= ipmr_rule_configure,
	.compare	= ipmr_rule_compare,
	.fill		= ipmr_rule_fill,
	.nlgroup	= RTNLGRP_IPV4_RULE,
	.policy		= ipmr_rule_policy,
	.owner		= THIS_MODULE,
};

static int __net_init ipmr_rules_init(struct net *net)
{
	struct fib_rules_ops *ops;
	struct mr_table *mrt;
	int err;

	ops = fib_rules_register(&ipmr_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	INIT_LIST_HEAD(&net->ipv4.mr_tables);

	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	if (IS_ERR(mrt)) {
		err = PTR_ERR(mrt);
		goto err1;
	}

	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
	if (err < 0)
		goto err2;

	net->ipv4.mr_rules_ops = ops;
	return 0;

err2:
	ipmr_free_table(mrt);
err1:
	fib_rules_unregister(ops);
	return err;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
	struct mr_table *mrt, *next;

	rtnl_lock();
	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
		list_del(&mrt->list);
		ipmr_free_table(mrt);
	}
	fib_rules_unregister(net->ipv4.mr_rules_ops);
	rtnl_unlock();
}

static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
{
	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
}

static unsigned int ipmr_rules_seq_read(struct net *net)
{
	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
}
#else
#define ipmr_for_each_table(mrt, net) \
	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	return net->ipv4.mrt;
}

static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
			   struct mr_table **mrt)
{
	*mrt = net->ipv4.mrt;
	return 0;
}

static int __net_init ipmr_rules_init(struct net *net)
{
	struct mr_table *mrt;

	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	if (IS_ERR(mrt))
		return PTR_ERR(mrt);
	net->ipv4.mrt = mrt;
	return 0;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
	rtnl_lock();
	ipmr_free_table(net->ipv4.mrt);
	net->ipv4.mrt = NULL;
	rtnl_unlock();
}

static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
{
	return 0;
}

static unsigned int ipmr_rules_seq_read(struct net *net)
{
	return 0;
}
#endif

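/* Resolved mfc entries are kept in an rhltable keyed on (origin, mcastgrp).
 * A list-valued hash table is used because entries sharing that key can
 * coexist, e.g. proxy entries that differ only in their parent vif.
 */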
static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
				const void *ptr)
{
	const struct mfc_cache_cmp_arg *cmparg = arg->key;
	struct mfc_cache *c = (struct mfc_cache *)ptr;

	return cmparg->mfc_mcastgrp != c->mfc_mcastgrp ||
	       cmparg->mfc_origin != c->mfc_origin;
}

static const struct rhashtable_params ipmr_rht_params = {
	.head_offset = offsetof(struct mfc_cache, mnode),
	.key_offset = offsetof(struct mfc_cache, cmparg),
	.key_len = sizeof(struct mfc_cache_cmp_arg),
	.nelem_hint = 3,
	.locks_mul = 1,
	.obj_cmpfn = ipmr_hash_cmp,
	.automatic_shrinking = true,
};

static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
	if (id != RT_TABLE_DEFAULT && id >= 1000000000)
		return ERR_PTR(-EINVAL);

	mrt = ipmr_get_table(net, id);
	if (mrt)
		return mrt;

	mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
	if (!mrt)
		return ERR_PTR(-ENOMEM);
	write_pnet(&mrt->net, net);
	mrt->id = id;

	rhltable_init(&mrt->mfc_hash, &ipmr_rht_params);
	INIT_LIST_HEAD(&mrt->mfc_cache_list);
	INIT_LIST_HEAD(&mrt->mfc_unres_queue);

	setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
		    (unsigned long)mrt);

	mrt->mroute_reg_vif_num = -1;
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
	return mrt;
}

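/* Tear a table down: stop the unresolved-queue timer, flush all cache
 * entries (including static ones) and free the table itself.
 */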
static void ipmr_free_table(struct mr_table *mrt)
{
	del_timer_sync(&mrt->ipmr_expire_timer);
	mroute_clean_tables(mrt, true);
	rhltable_destroy(&mrt->mfc_hash);
	kfree(mrt);
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
	struct net *net = dev_net(dev);

	dev_close(dev);

	dev = __dev_get_by_name(net, "tunl0");
	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

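		/* ndo_do_ioctl() expects a user-space pointer, so temporarily
		 * lift the address-space limit to pass it the on-stack
		 * parameter block.
		 */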
		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
	}
}

/* Initialize ipmr pimreg/tunnel in_device */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
	struct in_device *in_dev;

	ASSERT_RTNL();

	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev)
		return false;
	ipv4_devconf_setall(in_dev);
	neigh_parms_data_state_setall(in_dev->arp_parms);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

	return true;
}

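/* Create a DVMRP tunnel by driving the "tunl0" fallback device's
 * SIOCADDTUNNEL ioctl, then bring the resulting "dvmrp%d" device up.
 */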
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name(net, "tunl0");

	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
			set_fs(oldfs);
		} else {
			err = -EOPNOTSUPP;
		}
		dev = NULL;

		if (err == 0 &&
		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;
			if (!ipmr_init_vif_indev(dev))
				goto failure;
			if (dev_open(dev))
				goto failure;
			dev_hold(dev);
		}
	}
	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct mr_table *mrt;
	struct flowi4 fl4 = {
		.flowi4_oif	= dev->ifindex,
		.flowi4_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
		.flowi4_mark	= skb->mark,
	};
	int err;

	err = ipmr_fib_lookup(net, &fl4, &mrt);
	if (err < 0) {
		kfree_skb(skb);
		return err;
	}

	read_lock(&mrt_lock);
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return NETDEV_TX_OK;
}

static int reg_vif_get_iflink(const struct net_device *dev)
{
	return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
	.ndo_get_iflink = reg_vif_get_iflink,
};

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
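	/* leave room for the outer IP header and the 8-byte PIM register
	 * header added when register packets are encapsulated
	 */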
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->netdev_ops		= &reg_vif_netdev_ops;
	dev->needs_free_netdev	= true;
	dev->features		|= NETIF_F_NETNS_LOCAL;
}

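/* Create the "pimreg"/"pimregN" device used as the PIM register vif;
 * packets transmitted on it are bounced to the daemon via IGMPMSG_WHOLEPKT.
 */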
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	struct net_device *dev;
	char name[IFNAMSIZ];

	if (mrt->id == RT_TABLE_DEFAULT)
		sprintf(name, "pimreg");
	else
		sprintf(name, "pimreg%u", mrt->id);

	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);

	if (!dev)
		return NULL;

	dev_net_set(dev, net);

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}

	if (!ipmr_init_vif_indev(dev))
		goto failure;
	if (dev_open(dev))
		goto failure;

	dev_hold(dev);

	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}

/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
		     unsigned int pimlen)
{
	struct net_device *reg_dev = NULL;
	struct iphdr *encap;

	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
	/* Check that:
	 * a. packet is really sent to a multicast group
	 * b. packet is not a NULL-REGISTER
	 * c. packet is not truncated
	 */
	if (!ipv4_is_multicast(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + pimlen > skb->len)
		return 1;

	read_lock(&mrt_lock);
	if (mrt->mroute_reg_vif_num >= 0)
		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
	read_unlock(&mrt_lock);

	if (!reg_dev)
		return 1;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = CHECKSUM_NONE;

	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

	netif_rx(skb);

	return NET_RX_SUCCESS;
}
#else
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	return NULL;
}
#endif

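/* Helpers that feed VIF and MFC entries into the FIB notifier chain, so
 * listeners (e.g. switchdev drivers) can mirror the multicast FIB.
 */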
static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
					struct net *net,
					enum fib_event_type event_type,
					struct vif_device *vif,
					vifi_t vif_index, u32 tb_id)
{
	struct vif_entry_notifier_info info = {
		.info = {
			.family = RTNL_FAMILY_IPMR,
			.net = net,
		},
		.dev = vif->dev,
		.vif_index = vif_index,
		.vif_flags = vif->flags,
		.tb_id = tb_id,
	};

	return call_fib_notifier(nb, net, event_type, &info.info);
}

static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
					struct net *net,
					enum fib_event_type event_type,
					struct mfc_cache *mfc, u32 tb_id)
{
	struct mfc_entry_notifier_info info = {
		.info = {
			.family = RTNL_FAMILY_IPMR,
			.net = net,
		},
		.mfc = mfc,
		.tb_id = tb_id
	};

	return call_fib_notifier(nb, net, event_type, &info.info);
}

/**
 *	vif_delete - Delete a VIF entry
649
 *	@notify: Set to 1, if the caller is a notifier_call
L
Linus Torvalds 已提交
650
 */
651
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
652
		      struct list_head *head)
L
Linus Torvalds 已提交
653 654 655 656 657
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

658
	if (vifi < 0 || vifi >= mrt->maxvif)
L
Linus Torvalds 已提交
659 660
		return -EADDRNOTAVAIL;

661
	v = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
662 663 664 665 666 667 668 669 670 671

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

672 673
	if (vifi == mrt->mroute_reg_vif_num)
		mrt->mroute_reg_vif_num = -1;
L
Linus Torvalds 已提交
674

E
Eric Dumazet 已提交
675
	if (vifi + 1 == mrt->maxvif) {
L
Linus Torvalds 已提交
676
		int tmp;
E
Eric Dumazet 已提交
677 678

		for (tmp = vifi - 1; tmp >= 0; tmp--) {
679
			if (VIF_EXISTS(mrt, tmp))
L
Linus Torvalds 已提交
680 681
				break;
		}
682
		mrt->maxvif = tmp+1;
L
Linus Torvalds 已提交
683 684 685 686 687 688
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

E
Eric Dumazet 已提交
689 690
	in_dev = __in_dev_get_rtnl(dev);
	if (in_dev) {
691
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
692
		inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
693 694
					    NETCONFA_MC_FORWARDING,
					    dev->ifindex, &in_dev->cnf);
L
Linus Torvalds 已提交
695 696 697
		ip_rt_multicast_event(in_dev);
	}

E
Eric Dumazet 已提交
698
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
699
		unregister_netdevice_queue(dev, head);
L
Linus Torvalds 已提交
700 701 702 703 704

	dev_put(dev);
	return 0;
}

705
static void ipmr_cache_free_rcu(struct rcu_head *head)
706
{
707 708
	struct mfc_cache *c = container_of(head, struct mfc_cache, rcu);

709 710 711
	kmem_cache_free(mrt_cachep, c);
}

712
void ipmr_cache_free(struct mfc_cache *c)
713 714 715
{
	call_rcu(&c->rcu, ipmr_cache_free_rcu);
}
716
EXPORT_SYMBOL(ipmr_cache_free);
717

L
Linus Torvalds 已提交
718
/* Destroy an unresolved cache entry, killing queued skbs
E
Eric Dumazet 已提交
719
 * and reporting error to netlink readers.
L
Linus Torvalds 已提交
720
 */
721
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
L
Linus Torvalds 已提交
722
{
723
	struct net *net = read_pnet(&mrt->net);
L
Linus Torvalds 已提交
724
	struct sk_buff *skb;
725
	struct nlmsgerr *e;
L
Linus Torvalds 已提交
726

727
	atomic_dec(&mrt->cache_resolve_queue_len);
L
Linus Torvalds 已提交
728

J
Jianjun Kong 已提交
729
	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
730
		if (ip_hdr(skb)->version == 0) {
731 732
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct iphdr));
L
Linus Torvalds 已提交
733
			nlh->nlmsg_type = NLMSG_ERROR;
734
			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
L
Linus Torvalds 已提交
735
			skb_trim(skb, nlh->nlmsg_len);
736
			e = nlmsg_data(nlh);
737 738
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));
739

740
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
E
Eric Dumazet 已提交
741
		} else {
L
Linus Torvalds 已提交
742
			kfree_skb(skb);
E
Eric Dumazet 已提交
743
		}
L
Linus Torvalds 已提交
744 745
	}

746
	ipmr_cache_free(c);
L
Linus Torvalds 已提交
747 748
}

749 750
/* Timer process for the unresolved queue. */
static void ipmr_expire_process(unsigned long arg)
L
Linus Torvalds 已提交
751
{
752
	struct mr_table *mrt = (struct mr_table *)arg;
L
Linus Torvalds 已提交
753 754
	unsigned long now;
	unsigned long expires;
755
	struct mfc_cache *c, *next;
L
Linus Torvalds 已提交
756 757

	if (!spin_trylock(&mfc_unres_lock)) {
758
		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
L
Linus Torvalds 已提交
759 760 761
		return;
	}

762
	if (list_empty(&mrt->mfc_unres_queue))
L
Linus Torvalds 已提交
763 764 765 766 767
		goto out;

	now = jiffies;
	expires = 10*HZ;

768
	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
L
Linus Torvalds 已提交
769 770 771 772 773 774 775
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			continue;
		}

776
		list_del(&c->list);
777
		mroute_netlink_event(mrt, c, RTM_DELROUTE);
778
		ipmr_destroy_unres(mrt, c);
L
Linus Torvalds 已提交
779 780
	}

781 782
	if (!list_empty(&mrt->mfc_unres_queue))
		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
L
Linus Torvalds 已提交
783 784 785 786 787 788

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */
789
static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
790
				   unsigned char *ttls)
L
Linus Torvalds 已提交
791 792 793 794 795 796 797
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

798 799
	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
		if (VIF_EXISTS(mrt, vifi) &&
800
		    ttls[vifi] && ttls[vifi] < 255) {
L
Linus Torvalds 已提交
801 802 803 804 805 806 807
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
808
	cache->mfc_un.res.lastuse = jiffies;
L
Linus Torvalds 已提交
809 810
}

811 812
static int vif_add(struct net *net, struct mr_table *mrt,
		   struct vifctl *vifc, int mrtsock)
L
Linus Torvalds 已提交
813 814
{
	int vifi = vifc->vifc_vifi;
815
	struct vif_device *v = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
816 817
	struct net_device *dev;
	struct in_device *in_dev;
818
	int err;
L
Linus Torvalds 已提交
819 820

	/* Is vif busy ? */
821
	if (VIF_EXISTS(mrt, vifi))
L
Linus Torvalds 已提交
822 823 824 825
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
	case VIFF_REGISTER:
826
		if (!ipmr_pimsm_enabled())
827 828
			return -EINVAL;
		/* Special Purpose VIF in PIM
L
Linus Torvalds 已提交
829 830
		 * All the packets will be sent to the daemon
		 */
831
		if (mrt->mroute_reg_vif_num >= 0)
L
Linus Torvalds 已提交
832
			return -EADDRINUSE;
833
		dev = ipmr_reg_vif(net, mrt);
L
Linus Torvalds 已提交
834 835
		if (!dev)
			return -ENOBUFS;
836 837 838
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
839
			dev_put(dev);
840 841
			return err;
		}
L
Linus Torvalds 已提交
842
		break;
843
	case VIFF_TUNNEL:
844
		dev = ipmr_new_tunnel(net, vifc);
L
Linus Torvalds 已提交
845 846
		if (!dev)
			return -ENOBUFS;
847 848 849
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
850
			dev_put(dev);
851 852
			return err;
		}
L
Linus Torvalds 已提交
853
		break;
854
	case VIFF_USE_IFINDEX:
L
Linus Torvalds 已提交
855
	case 0:
856 857
		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
858
			if (dev && !__in_dev_get_rtnl(dev)) {
859 860 861
				dev_put(dev);
				return -EADDRNOTAVAIL;
			}
E
Eric Dumazet 已提交
862
		} else {
863
			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
E
Eric Dumazet 已提交
864
		}
L
Linus Torvalds 已提交
865 866
		if (!dev)
			return -EADDRNOTAVAIL;
867
		err = dev_set_allmulti(dev, 1);
868 869
		if (err) {
			dev_put(dev);
870
			return err;
871
		}
L
Linus Torvalds 已提交
872 873 874 875 876
		break;
	default:
		return -EINVAL;
	}

E
Eric Dumazet 已提交
877 878
	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev) {
879
		dev_put(dev);
L
Linus Torvalds 已提交
880
		return -EADDRNOTAVAIL;
881
	}
882
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
883 884
	inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
				    dev->ifindex, &in_dev->cnf);
L
Linus Torvalds 已提交
885 886
	ip_rt_multicast_event(in_dev);

E
Eric Dumazet 已提交
887 888
	/* Fill in the VIF structures */

J
Jianjun Kong 已提交
889 890 891 892
	v->rate_limit = vifc->vifc_rate_limit;
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
	v->flags = vifc->vifc_flags;
L
Linus Torvalds 已提交
893 894
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
J
Jianjun Kong 已提交
895
	v->threshold = vifc->vifc_threshold;
L
Linus Torvalds 已提交
896 897 898 899 900
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
E
Eric Dumazet 已提交
901
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER))
902
		v->link = dev_get_iflink(dev);
L
Linus Torvalds 已提交
903 904 905

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
J
Jianjun Kong 已提交
906
	v->dev = dev;
E
Eric Dumazet 已提交
907
	if (v->flags & VIFF_REGISTER)
908 909 910
		mrt->mroute_reg_vif_num = vifi;
	if (vifi+1 > mrt->maxvif)
		mrt->maxvif = vifi+1;
L
Linus Torvalds 已提交
911 912 913 914
	write_unlock_bh(&mrt_lock);
	return 0;
}

915
/* called with rcu_read_lock() */
916
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
917 918
					 __be32 origin,
					 __be32 mcastgrp)
L
Linus Torvalds 已提交
919
{
920 921 922 923 924
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = origin
	};
	struct rhlist_head *tmp, *list;
L
Linus Torvalds 已提交
925 926
	struct mfc_cache *c;

927 928 929 930
	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode)
		return c;

931
	return NULL;
L
Linus Torvalds 已提交
932 933
}

934 935 936 937
/* Look for a (*,*,oif) entry */
static struct mfc_cache *ipmr_cache_find_any_parent(struct mr_table *mrt,
						    int vifi)
{
938 939 940 941 942
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = htonl(INADDR_ANY),
			.mfc_origin = htonl(INADDR_ANY)
	};
	struct rhlist_head *tmp, *list;
943 944
	struct mfc_cache *c;

945 946 947
	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode)
		if (c->mfc_un.res.ttls[vifi] < 255)
948 949 950 951 952 953 954 955 956
			return c;

	return NULL;
}

/* Look for a (*,G) entry */
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
					     __be32 mcastgrp, int vifi)
{
957 958 959 960 961
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = htonl(INADDR_ANY)
	};
	struct rhlist_head *tmp, *list;
962 963
	struct mfc_cache *c, *proxy;

964
	if (mcastgrp == htonl(INADDR_ANY))
965 966
		goto skip;

967 968 969 970 971 972 973 974 975 976
	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode) {
		if (c->mfc_un.res.ttls[vifi] < 255)
			return c;

		/* It's ok if the vifi is part of the static tree */
		proxy = ipmr_cache_find_any_parent(mrt, c->mfc_parent);
		if (proxy && proxy->mfc_un.res.ttls[vifi] < 255)
			return c;
	}
977 978 979 980 981

skip:
	return ipmr_cache_find_any_parent(mrt, vifi);
}

982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001
/* Look for a (S,G,iif) entry if parent != -1 */
static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
						__be32 origin, __be32 mcastgrp,
						int parent)
{
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = origin,
	};
	struct rhlist_head *tmp, *list;
	struct mfc_cache *c;

	list = rhltable_lookup(&mrt->mfc_hash, &arg, ipmr_rht_params);
	rhl_for_each_entry_rcu(c, tmp, list, mnode)
		if (parent == -1 || parent == c->mfc_parent)
			return c;

	return NULL;
}

1002
/* Allocate a multicast cache entry */
1003
static struct mfc_cache *ipmr_cache_alloc(void)
L
Linus Torvalds 已提交
1004
{
J
Jianjun Kong 已提交
1005
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
1006

1007 1008
	if (c) {
		c->mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
1009
		c->mfc_un.res.minvif = MAXVIFS;
1010
		refcount_set(&c->mfc_un.res.refcount, 1);
1011
	}
L
Linus Torvalds 已提交
1012 1013 1014
	return c;
}

1015
static struct mfc_cache *ipmr_cache_alloc_unres(void)
L
Linus Torvalds 已提交
1016
{
J
Jianjun Kong 已提交
1017
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
1018 1019 1020 1021 1022

	if (c) {
		skb_queue_head_init(&c->mfc_un.unres.unresolved);
		c->mfc_un.unres.expires = jiffies + 10*HZ;
	}
L
Linus Torvalds 已提交
1023 1024 1025
	return c;
}

1026
/* A cache entry has gone into a resolved state from queued */
1027 1028
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
			       struct mfc_cache *uc, struct mfc_cache *c)
L
Linus Torvalds 已提交
1029 1030
{
	struct sk_buff *skb;
1031
	struct nlmsgerr *e;
L
Linus Torvalds 已提交
1032

E
Eric Dumazet 已提交
1033
	/* Play the pending entries through our router */
J
Jianjun Kong 已提交
1034
	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
1035
		if (ip_hdr(skb)->version == 0) {
1036 1037
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct iphdr));
L
Linus Torvalds 已提交
1038

1039
			if (__ipmr_fill_mroute(mrt, skb, c, nlmsg_data(nlh)) > 0) {
E
Eric Dumazet 已提交
1040 1041
				nlh->nlmsg_len = skb_tail_pointer(skb) -
						 (u8 *)nlh;
L
Linus Torvalds 已提交
1042 1043
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
1044
				nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
L
Linus Torvalds 已提交
1045
				skb_trim(skb, nlh->nlmsg_len);
1046
				e = nlmsg_data(nlh);
1047 1048
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
L
Linus Torvalds 已提交
1049
			}
1050

1051
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
E
Eric Dumazet 已提交
1052
		} else {
1053
			ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
E
Eric Dumazet 已提交
1054
		}
L
Linus Torvalds 已提交
1055 1056 1057
	}
}

1058
/* Bounce a cache query up to mrouted and netlink.
L
Linus Torvalds 已提交
1059
 *
1060
 * Called under mrt_lock.
L
Linus Torvalds 已提交
1061
 */
1062
static int ipmr_cache_report(struct mr_table *mrt,
1063
			     struct sk_buff *pkt, vifi_t vifi, int assert)
L
Linus Torvalds 已提交
1064
{
1065
	const int ihl = ip_hdrlen(pkt);
1066
	struct sock *mroute_sk;
L
Linus Torvalds 已提交
1067 1068
	struct igmphdr *igmp;
	struct igmpmsg *msg;
1069
	struct sk_buff *skb;
L
Linus Torvalds 已提交
1070 1071 1072 1073 1074 1075 1076
	int ret;

	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
		skb = alloc_skb(128, GFP_ATOMIC);

S
Stephen Hemminger 已提交
1077
	if (!skb)
L
Linus Torvalds 已提交
1078 1079 1080 1081
		return -ENOBUFS;

	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
E
Eric Dumazet 已提交
1082 1083 1084
		 * Duplicate old header, fix ihl, length etc.
		 * And all this only to mangle msg->im_msgtype and
		 * to set msg->im_mbz to "mbz" :-)
L
Linus Torvalds 已提交
1085
		 */
1086 1087
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
1088
		skb_reset_transport_header(skb);
1089
		msg = (struct igmpmsg *)skb_network_header(skb);
1090
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
L
Linus Torvalds 已提交
1091 1092
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
1093
		msg->im_vif = mrt->mroute_reg_vif_num;
1094 1095 1096
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107
	} else {
		/* Copy the IP header */
		skb_set_network_header(skb, skb->len);
		skb_put(skb, ihl);
		skb_copy_to_linear_data(skb, pkt->data, ihl);
		/* Flag to the kernel this is a route add */
		ip_hdr(skb)->protocol = 0;
		msg = (struct igmpmsg *)skb_network_header(skb);
		msg->im_vif = vifi;
		skb_dst_set(skb, dst_clone(skb_dst(pkt)));
		/* Add our header */
1108
		igmp = skb_put(skb, sizeof(struct igmphdr));
1109 1110 1111 1112 1113
		igmp->type = assert;
		msg->im_msgtype = assert;
		igmp->code = 0;
		ip_hdr(skb)->tot_len = htons(skb->len);	/* Fix the length */
		skb->transport_header = skb->network_header;
1114
	}
L
Linus Torvalds 已提交
1115

E
Eric Dumazet 已提交
1116 1117
	rcu_read_lock();
	mroute_sk = rcu_dereference(mrt->mroute_sk);
1118
	if (!mroute_sk) {
E
Eric Dumazet 已提交
1119
		rcu_read_unlock();
L
Linus Torvalds 已提交
1120 1121 1122 1123
		kfree_skb(skb);
		return -EINVAL;
	}

1124 1125
	igmpmsg_netlink_event(mrt, skb);

E
Eric Dumazet 已提交
1126
	/* Deliver to mrouted */
E
Eric Dumazet 已提交
1127 1128
	ret = sock_queue_rcv_skb(mroute_sk, skb);
	rcu_read_unlock();
1129
	if (ret < 0) {
1130
		net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
L
Linus Torvalds 已提交
1131 1132 1133 1134 1135 1136
		kfree_skb(skb);
	}

	return ret;
}

1137 1138
/* Queue a packet for resolution. It gets locked cache entry! */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
1139
				 struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
1140
{
1141 1142
	const struct iphdr *iph = ip_hdr(skb);
	struct mfc_cache *c;
1143
	bool found = false;
L
Linus Torvalds 已提交
1144 1145 1146
	int err;

	spin_lock_bh(&mfc_unres_lock);
1147
	list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
1148
		if (c->mfc_mcastgrp == iph->daddr &&
1149 1150
		    c->mfc_origin == iph->saddr) {
			found = true;
L
Linus Torvalds 已提交
1151
			break;
1152
		}
L
Linus Torvalds 已提交
1153 1154
	}

1155
	if (!found) {
E
Eric Dumazet 已提交
1156
		/* Create a new entry if allowable */
1157
		if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
1158
		    (c = ipmr_cache_alloc_unres()) == NULL) {
L
Linus Torvalds 已提交
1159 1160 1161 1162 1163 1164
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

E
Eric Dumazet 已提交
1165
		/* Fill in the new cache entry */
1166 1167 1168
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;
L
Linus Torvalds 已提交
1169

E
Eric Dumazet 已提交
1170
		/* Reflect first query at mrouted. */
1171
		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
1172
		if (err < 0) {
1173
			/* If the report failed throw the cache entry
L
Linus Torvalds 已提交
1174 1175 1176 1177
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

1178
			ipmr_cache_free(c);
L
Linus Torvalds 已提交
1179 1180 1181 1182
			kfree_skb(skb);
			return err;
		}

1183 1184
		atomic_inc(&mrt->cache_resolve_queue_len);
		list_add(&c->list, &mrt->mfc_unres_queue);
1185
		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1186

1187 1188
		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
			mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
L
Linus Torvalds 已提交
1189 1190
	}

E
Eric Dumazet 已提交
1191 1192
	/* See if we can append the packet */
	if (c->mfc_un.unres.unresolved.qlen > 3) {
L
Linus Torvalds 已提交
1193 1194 1195
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
1196 1197 1198 1199
		if (dev) {
			skb->dev = dev;
			skb->skb_iif = dev->ifindex;
		}
J
Jianjun Kong 已提交
1200
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
L
Linus Torvalds 已提交
1201 1202 1203 1204 1205 1206 1207
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

1208
/* MFC cache manipulation by user space mroute daemon */
L
Linus Torvalds 已提交
1209

1210
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
L
Linus Torvalds 已提交
1211
{
1212
	struct mfc_cache *c;
L
Linus Torvalds 已提交
1213

1214 1215 1216 1217 1218 1219 1220 1221 1222 1223
	/* The entries are added/deleted only under RTNL */
	rcu_read_lock();
	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
				   mfc->mfcc_mcastgrp.s_addr, parent);
	rcu_read_unlock();
	if (!c)
		return -ENOENT;
	rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
	list_del_rcu(&c->list);
	mroute_netlink_event(mrt, c, RTM_DELROUTE);
1224
	ipmr_cache_put(c);
L
Linus Torvalds 已提交
1225

1226
	return 0;
L
Linus Torvalds 已提交
1227 1228
}

1229
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1230
			struct mfcctl *mfc, int mrtsock, int parent)
L
Linus Torvalds 已提交
1231
{
1232
	struct mfc_cache *uc, *c;
1233 1234
	bool found;
	int ret;
L
Linus Torvalds 已提交
1235

1236 1237 1238
	if (mfc->mfcc_parent >= MAXVIFS)
		return -ENFILE;

1239 1240 1241 1242 1243 1244
	/* The entries are added/deleted only under RTNL */
	rcu_read_lock();
	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
				   mfc->mfcc_mcastgrp.s_addr, parent);
	rcu_read_unlock();
	if (c) {
L
Linus Torvalds 已提交
1245 1246
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
1247
		ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
L
Linus Torvalds 已提交
1248 1249 1250
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
1251
		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1252 1253 1254
		return 0;
	}

1255
	if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
1256
	    !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
L
Linus Torvalds 已提交
1257 1258
		return -EINVAL;

1259
	c = ipmr_cache_alloc();
1260
	if (!c)
L
Linus Torvalds 已提交
1261 1262
		return -ENOMEM;

J
Jianjun Kong 已提交
1263 1264 1265
	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent = mfc->mfcc_parent;
1266
	ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
L
Linus Torvalds 已提交
1267 1268 1269
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

1270 1271 1272 1273 1274 1275 1276 1277
	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->mnode,
				  ipmr_rht_params);
	if (ret) {
		pr_err("ipmr: rhtable insert error %d\n", ret);
		ipmr_cache_free(c);
		return ret;
	}
	list_add_tail_rcu(&c->list, &mrt->mfc_cache_list);
1278 1279
	/* Check to see if we resolved a queued list. If so we
	 * need to send on the frames and tidy up.
L
Linus Torvalds 已提交
1280
	 */
1281
	found = false;
L
Linus Torvalds 已提交
1282
	spin_lock_bh(&mfc_unres_lock);
1283
	list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
1284
		if (uc->mfc_origin == c->mfc_origin &&
L
Linus Torvalds 已提交
1285
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1286
			list_del(&uc->list);
1287
			atomic_dec(&mrt->cache_resolve_queue_len);
1288
			found = true;
L
Linus Torvalds 已提交
1289 1290 1291
			break;
		}
	}
1292 1293
	if (list_empty(&mrt->mfc_unres_queue))
		del_timer(&mrt->ipmr_expire_timer);
L
Linus Torvalds 已提交
1294 1295
	spin_unlock_bh(&mfc_unres_lock);

1296
	if (found) {
1297
		ipmr_cache_resolve(net, mrt, uc, c);
1298
		ipmr_cache_free(uc);
L
Linus Torvalds 已提交
1299
	}
1300
	mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1301 1302 1303
	return 0;
}

1304
/* Close the multicast socket, and clear the vif tables etc */
1305
static void mroute_clean_tables(struct mr_table *mrt, bool all)
L
Linus Torvalds 已提交
1306
{
1307
	struct mfc_cache *c, *tmp;
1308
	LIST_HEAD(list);
1309
	int i;
1310

E
Eric Dumazet 已提交
1311
	/* Shut down all active vif entries */
1312
	for (i = 0; i < mrt->maxvif; i++) {
1313 1314 1315
		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
			continue;
		vif_delete(mrt, i, 0, &list);
L
Linus Torvalds 已提交
1316
	}
1317
	unregister_netdevice_many(&list);
L
Linus Torvalds 已提交
1318

E
Eric Dumazet 已提交
1319
	/* Wipe the cache */
1320 1321 1322 1323 1324 1325
	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
		if (!all && (c->mfc_flags & MFC_STATIC))
			continue;
		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
		list_del_rcu(&c->list);
		mroute_netlink_event(mrt, c, RTM_DELROUTE);
1326
		ipmr_cache_put(c);
L
Linus Torvalds 已提交
1327 1328
	}

1329
	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
L
Linus Torvalds 已提交
1330
		spin_lock_bh(&mfc_unres_lock);
1331
		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
1332
			list_del(&c->list);
1333
			mroute_netlink_event(mrt, c, RTM_DELROUTE);
1334
			ipmr_destroy_unres(mrt, c);
L
Linus Torvalds 已提交
1335 1336 1337 1338 1339
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

E
Eric Dumazet 已提交
1340 1341 1342
/* called from ip_ra_control(), before an RCU grace period,
 * we dont need to call synchronize_rcu() here
 */
L
Linus Torvalds 已提交
1343 1344
static void mrtsock_destruct(struct sock *sk)
{
1345
	struct net *net = sock_net(sk);
1346
	struct mr_table *mrt;
1347

1348
	ASSERT_RTNL();
1349
	ipmr_for_each_table(mrt, net) {
E
Eric Dumazet 已提交
1350
		if (sk == rtnl_dereference(mrt->mroute_sk)) {
1351
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1352 1353
			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
						    NETCONFA_MC_FORWARDING,
1354 1355
						    NETCONFA_IFINDEX_ALL,
						    net->ipv4.devconf_all);
1356
			RCU_INIT_POINTER(mrt->mroute_sk, NULL);
1357
			mroute_clean_tables(mrt, false);
1358
		}
L
Linus Torvalds 已提交
1359 1360 1361
	}
}

1362 1363 1364 1365
/* Socket options and virtual interface manipulation. The whole
 * virtual interface system is a complete heap, but unfortunately
 * that's how BSD mrouted happens to think. Maybe one day with a proper
 * MOSPF/PIM router set up we can clean this up.
L
Linus Torvalds 已提交
1366
 */
1367

1368 1369
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
			 unsigned int optlen)
L
Linus Torvalds 已提交
1370
{
1371
	struct net *net = sock_net(sk);
1372
	int val, ret = 0, parent = 0;
1373
	struct mr_table *mrt;
1374 1375 1376
	struct vifctl vif;
	struct mfcctl mfc;
	u32 uval;
1377

1378 1379
	/* There's one exception to the lock - MRT_DONE which needs to unlock */
	rtnl_lock();
1380
	if (sk->sk_type != SOCK_RAW ||
1381 1382 1383 1384
	    inet_sk(sk)->inet_num != IPPROTO_IGMP) {
		ret = -EOPNOTSUPP;
		goto out_unlock;
	}
1385

1386
	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1387 1388 1389 1390
	if (!mrt) {
		ret = -ENOENT;
		goto out_unlock;
	}
S
Stephen Hemminger 已提交
1391
	if (optname != MRT_INIT) {
1392
		if (sk != rcu_access_pointer(mrt->mroute_sk) &&
1393 1394 1395 1396
		    !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
			ret = -EACCES;
			goto out_unlock;
		}
L
Linus Torvalds 已提交
1397 1398
	}

S
Stephen Hemminger 已提交
1399 1400
	switch (optname) {
	case MRT_INIT:
1401
		if (optlen != sizeof(int)) {
1402
			ret = -EINVAL;
1403 1404 1405
			break;
		}
		if (rtnl_dereference(mrt->mroute_sk)) {
1406 1407
			ret = -EADDRINUSE;
			break;
1408
		}
S
Stephen Hemminger 已提交
1409 1410 1411

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
1412
			rcu_assign_pointer(mrt->mroute_sk, sk);
1413
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1414 1415
			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
						    NETCONFA_MC_FORWARDING,
1416 1417
						    NETCONFA_IFINDEX_ALL,
						    net->ipv4.devconf_all);
S
Stephen Hemminger 已提交
1418
		}
1419
		break;
S
Stephen Hemminger 已提交
1420
	case MRT_DONE:
1421 1422 1423 1424
		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
			ret = -EACCES;
		} else {
			ret = ip_ra_control(sk, 0, NULL);
1425
			goto out_unlock;
1426 1427
		}
		break;
S
Stephen Hemminger 已提交
1428 1429
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441
		if (optlen != sizeof(vif)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&vif, optval, sizeof(vif))) {
			ret = -EFAULT;
			break;
		}
		if (vif.vifc_vifi >= MAXVIFS) {
			ret = -ENFILE;
			break;
		}
J
Jianjun Kong 已提交
1442
		if (optname == MRT_ADD_VIF) {
E
Eric Dumazet 已提交
1443 1444
			ret = vif_add(net, mrt, &vif,
				      sk == rtnl_dereference(mrt->mroute_sk));
S
Stephen Hemminger 已提交
1445
		} else {
1446
			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
S
Stephen Hemminger 已提交
1447
		}
1448
		break;
1449 1450 1451
	/* Manipulate the forwarding caches. These live
	 * in a sort of kernel/user symbiosis.
	 */
S
Stephen Hemminger 已提交
1452 1453
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
1454 1455 1456
		parent = -1;
	case MRT_ADD_MFC_PROXY:
	case MRT_DEL_MFC_PROXY:
1457 1458 1459 1460 1461 1462 1463 1464
		if (optlen != sizeof(mfc)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&mfc, optval, sizeof(mfc))) {
			ret = -EFAULT;
			break;
		}
1465 1466 1467 1468
		if (parent == 0)
			parent = mfc.mfcc_parent;
		if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
			ret = ipmr_mfc_delete(mrt, &mfc, parent);
S
Stephen Hemminger 已提交
1469
		else
E
Eric Dumazet 已提交
1470
			ret = ipmr_mfc_add(net, mrt, &mfc,
1471 1472
					   sk == rtnl_dereference(mrt->mroute_sk),
					   parent);
1473
		break;
1474
	/* Control PIM assert. */
S
Stephen Hemminger 已提交
1475
	case MRT_ASSERT:
1476 1477 1478 1479 1480 1481 1482 1483 1484 1485
		if (optlen != sizeof(val)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(val, (int __user *)optval)) {
			ret = -EFAULT;
			break;
		}
		mrt->mroute_do_assert = val;
		break;
S
Stephen Hemminger 已提交
1486
	case MRT_PIM:
1487
		if (!ipmr_pimsm_enabled()) {
1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498
			ret = -ENOPROTOOPT;
			break;
		}
		if (optlen != sizeof(val)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(val, (int __user *)optval)) {
			ret = -EFAULT;
			break;
		}
S
Stephen Hemminger 已提交
1499

1500 1501 1502 1503
		val = !!val;
		if (val != mrt->mroute_do_pim) {
			mrt->mroute_do_pim = val;
			mrt->mroute_do_assert = val;
L
Linus Torvalds 已提交
1504
		}
1505
		break;
1506
	case MRT_TABLE:
1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518
		if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
			ret = -ENOPROTOOPT;
			break;
		}
		if (optlen != sizeof(uval)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(uval, (u32 __user *)optval)) {
			ret = -EFAULT;
			break;
		}
1519

E
Eric Dumazet 已提交
1520 1521 1522
		if (sk == rtnl_dereference(mrt->mroute_sk)) {
			ret = -EBUSY;
		} else {
1523
			mrt = ipmr_new_table(net, uval);
1524 1525
			if (IS_ERR(mrt))
				ret = PTR_ERR(mrt);
1526
			else
1527
				raw_sk(sk)->ipmr_table = uval;
E
Eric Dumazet 已提交
1528
		}
1529
		break;
1530
	/* Spurious command, or MRT_VERSION which you cannot set. */
S
Stephen Hemminger 已提交
1531
	default:
1532
		ret = -ENOPROTOOPT;
L
Linus Torvalds 已提交
1533
	}
1534 1535 1536
out_unlock:
	rtnl_unlock();
	return ret;
L
Linus Torvalds 已提交
1537 1538
}

1539
/* Getsock opt support for the multicast routing system. */
J
Jianjun Kong 已提交
1540
int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
L
Linus Torvalds 已提交
1541 1542 1543
{
	int olr;
	int val;
1544
	struct net *net = sock_net(sk);
1545 1546
	struct mr_table *mrt;

1547 1548 1549 1550
	if (sk->sk_type != SOCK_RAW ||
	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
		return -EOPNOTSUPP;

1551
	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1552
	if (!mrt)
1553
		return -ENOENT;
L
Linus Torvalds 已提交
1554

1555 1556 1557 1558 1559
	switch (optname) {
	case MRT_VERSION:
		val = 0x0305;
		break;
	case MRT_PIM:
1560
		if (!ipmr_pimsm_enabled())
1561 1562 1563 1564 1565 1566 1567
			return -ENOPROTOOPT;
		val = mrt->mroute_do_pim;
		break;
	case MRT_ASSERT:
		val = mrt->mroute_do_assert;
		break;
	default:
L
Linus Torvalds 已提交
1568
		return -ENOPROTOOPT;
1569
	}
L
Linus Torvalds 已提交
1570 1571 1572 1573 1574 1575

	if (get_user(olr, optlen))
		return -EFAULT;
	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;
J
Jianjun Kong 已提交
1576
	if (put_user(olr, optlen))
L
Linus Torvalds 已提交
1577
		return -EFAULT;
J
Jianjun Kong 已提交
1578
	if (copy_to_user(optval, &val, olr))
L
Linus Torvalds 已提交
1579 1580 1581 1582
		return -EFAULT;
	return 0;
}

1583
/* The IP multicast ioctl support routines. */
L
Linus Torvalds 已提交
1584 1585 1586 1587 1588 1589
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;
1590
	struct net *net = sock_net(sk);
1591 1592 1593
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1594
	if (!mrt)
1595
		return -ENOENT;
1596

S
Stephen Hemminger 已提交
1597 1598
	switch (cmd) {
	case SIOCGETVIFCNT:
J
Jianjun Kong 已提交
1599
		if (copy_from_user(&vr, arg, sizeof(vr)))
S
Stephen Hemminger 已提交
1600
			return -EFAULT;
1601
		if (vr.vifi >= mrt->maxvif)
S
Stephen Hemminger 已提交
1602 1603
			return -EINVAL;
		read_lock(&mrt_lock);
1604 1605
		vif = &mrt->vif_table[vr.vifi];
		if (VIF_EXISTS(mrt, vr.vifi)) {
J
Jianjun Kong 已提交
1606 1607 1608 1609
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
L
Linus Torvalds 已提交
1610 1611
			read_unlock(&mrt_lock);

J
Jianjun Kong 已提交
1612
			if (copy_to_user(arg, &vr, sizeof(vr)))
S
Stephen Hemminger 已提交
1613 1614 1615 1616 1617 1618
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
J
Jianjun Kong 已提交
1619
		if (copy_from_user(&sr, arg, sizeof(sr)))
S
Stephen Hemminger 已提交
1620 1621
			return -EFAULT;

1622
		rcu_read_lock();
1623
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
S
Stephen Hemminger 已提交
1624 1625 1626 1627
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
1628
			rcu_read_unlock();
S
Stephen Hemminger 已提交
1629

J
Jianjun Kong 已提交
1630
			if (copy_to_user(arg, &sr, sizeof(sr)))
S
Stephen Hemminger 已提交
1631 1632 1633
				return -EFAULT;
			return 0;
		}
1634
		rcu_read_unlock();
S
Stephen Hemminger 已提交
1635 1636 1637
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
L
Linus Torvalds 已提交
1638 1639 1640
	}
}

1641 1642 1643 1644 1645 1646 1647 1648 1649
#ifdef CONFIG_COMPAT
struct compat_sioc_sg_req {
	struct in_addr src;
	struct in_addr grp;
	compat_ulong_t pktcnt;
	compat_ulong_t bytecnt;
	compat_ulong_t wrong_if;
};

1650 1651 1652 1653 1654 1655 1656 1657
struct compat_sioc_vif_req {
	vifi_t	vifi;		/* Which iface */
	compat_ulong_t icount;
	compat_ulong_t ocount;
	compat_ulong_t ibytes;
	compat_ulong_t obytes;
};

1658 1659
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
1660
	struct compat_sioc_sg_req sr;
1661 1662
	struct compat_sioc_vif_req vr;
	struct vif_device *vif;
1663 1664 1665 1666 1667
	struct mfc_cache *c;
	struct net *net = sock_net(sk);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1668
	if (!mrt)
1669 1670 1671
		return -ENOENT;

	switch (cmd) {
1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 1690 1691
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= mrt->maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif = &mrt->vif_table[vr.vifi];
		if (VIF_EXISTS(mrt, vr.vifi)) {
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &vr, sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
1692 1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715
	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		rcu_read_lock();
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			rcu_read_unlock();

			if (copy_to_user(arg, &sr, sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		rcu_read_unlock();
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}
#endif

L
Linus Torvalds 已提交
1716 1717
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
1718
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1719
	struct net *net = dev_net(dev);
1720
	struct mr_table *mrt;
L
Linus Torvalds 已提交
1721 1722
	struct vif_device *v;
	int ct;
1723

L
Linus Torvalds 已提交
1724 1725
	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
1726 1727 1728 1729 1730

	ipmr_for_each_table(mrt, net) {
		v = &mrt->vif_table[0];
		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
			if (v->dev == dev)
1731
				vif_delete(mrt, ct, 1, NULL);
1732
		}
L
Linus Torvalds 已提交
1733 1734 1735 1736
	}
	return NOTIFY_DONE;
}

J
Jianjun Kong 已提交
1737
static struct notifier_block ip_mr_notifier = {
L
Linus Torvalds 已提交
1738 1739 1740
	.notifier_call = ipmr_device_event,
};

1741 1742 1743
/* Encapsulate a packet by attaching a valid IPIP header to it.
 * This avoids tunnel drivers and other mess and gives us the speed so
 * important for multicast video.
L
Linus Torvalds 已提交
1744
 */
1745 1746
static void ip_encap(struct net *net, struct sk_buff *skb,
		     __be32 saddr, __be32 daddr)
L
Linus Torvalds 已提交
1747
{
1748
	struct iphdr *iph;
1749
	const struct iphdr *old_iph = ip_hdr(skb);
1750 1751

	skb_push(skb, sizeof(struct iphdr));
1752
	skb->transport_header = skb->network_header;
1753
	skb_reset_network_header(skb);
1754
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1755

E
Eric Dumazet 已提交
1756
	iph->version	=	4;
1757 1758
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
L
Linus Torvalds 已提交
1759 1760 1761 1762 1763 1764
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
1765
	ip_select_ident(net, skb, NULL);
L
Linus Torvalds 已提交
1766 1767 1768 1769 1770 1771
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}

1772 1773
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
				      struct sk_buff *skb)
L
Linus Torvalds 已提交
1774
{
E
Eric Dumazet 已提交
1775
	struct ip_options *opt = &(IPCB(skb)->opt);
L
Linus Torvalds 已提交
1776

1777 1778
	IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
L
Linus Torvalds 已提交
1779 1780 1781 1782

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

1783
	return dst_output(net, sk, skb);
L
Linus Torvalds 已提交
1784 1785
}

/* Processing handlers for ipmr_forward */

static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
			    struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct vif_device *vif = &mrt->vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	struct flowi4 fl4;
	int    encap = 0;

	if (!vif->dev)
		goto out_free;

	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out += skb->len;
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
		goto out_free;
	}

	if (vif->flags & VIFF_TUNNEL) {
		rt = ip_route_output_ports(net, &fl4, NULL,
					   vif->remote, vif->local,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos), vif->link);
		if (IS_ERR(rt))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos), vif->link);
		if (IS_ERR(rt))
			goto out_free;
	}

	dev = rt->dst.dev;

	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		 * allow to send ICMP, so that packets will disappear
		 * to blackhole.
		 */
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out += skb->len;

	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR
	 */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(net, skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		vif->dev->stats.tx_packets++;
		vif->dev->stats.tx_bytes += skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
	 * not only before forwarding, but after forwarding on all output
	 * interfaces. It is clear, if mrouter runs a multicasting
	 * program, it should receive packets not depending to what interface
	 * program is joined.
	 * If we will not make it, the program will have to join on all
	 * interfaces. On the other hand, multihoming host (or router, but
	 * not mrouter) cannot join to more than one interface - it will
	 * result in receiving multiple packets.
	 */
	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
		net, NULL, skb, skb->dev, dev,
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
}

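/* Find the vif index backed by a given device; returns -1 if the device has no vif */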
static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
{
	int ct;

	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
		if (mrt->vif_table[ct].dev == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
			  struct net_device *dev, struct sk_buff *skb,
			  struct mfc_cache *cache, int local)
{
	int true_vifi = ipmr_find_vif(mrt, dev);
	int psend = -1;
	int vif, ct;

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;
	cache->mfc_un.res.lastuse = jiffies;

	if (cache->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
		struct mfc_cache *cache_proxy;

		/* For an (*,G) entry, we only check that the incoming
		 * interface is part of the static tree.
		 */
		cache_proxy = ipmr_cache_find_any_parent(mrt, vif);
		if (cache_proxy &&
		    cache_proxy->mfc_un.res.ttls[true_vifi] < 255)
			goto forward;
	}

	/* Wrong interface: drop packet and (maybe) send PIM assert. */
	if (mrt->vif_table[vif].dev != dev) {
		if (rt_is_output_route(skb_rtable(skb))) {
			/* It is our own packet, looped back.
			 * Very complicated situation...
			 *
			 * The best workaround until routing daemons will be
			 * fixed is not to redistribute packet, if it was
			 * send through wrong interface. It means, that
			 * multicast applications WILL NOT work for
			 * (S,G), which have default multicast route pointing
			 * to wrong oif. In any case, it is not a good
			 * idea to use multicasting applications on router.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;

		if (true_vifi >= 0 && mrt->mroute_do_assert &&
		    /* pimsm uses asserts, when switching from RPT to SPT,
		     * so that we cannot check that packet arrived on an oif.
		     * It is bad, but otherwise we would need to move pretty
		     * large chunk of pimd to kernel. Ough... --ANK
		     */
		    (mrt->mroute_do_pim ||
		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

forward:
	mrt->vif_table[vif].pkt_in++;
	mrt->vif_table[vif].bytes_in += skb->len;

	/* Forward the frame */
	if (cache->mfc_origin == htonl(INADDR_ANY) &&
	    cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
		if (true_vifi >= 0 &&
		    true_vifi != cache->mfc_parent &&
		    ip_hdr(skb)->ttl >
				cache->mfc_un.res.ttls[cache->mfc_parent]) {
			/* It's an (*,*) entry and the packet is not coming from
			 * the upstream: forward the packet to the upstream
			 * only.
			 */
			psend = cache->mfc_parent;
			goto last_forward;
		}
		goto dont_forward;
	}
	for (ct = cache->mfc_un.res.maxvif - 1;
	     ct >= cache->mfc_un.res.minvif; ct--) {
		/* For (*,G) entry, don't forward to the incoming interface */
		if ((cache->mfc_origin != htonl(INADDR_ANY) ||
		     ct != true_vifi) &&
		    ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

				if (skb2)
					ipmr_queue_xmit(net, mrt, skb2, cache,
							psend);
			}
			psend = ct;
		}
	}
last_forward:
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);

			if (skb2)
				ipmr_queue_xmit(net, mrt, skb2, cache, psend);
		} else {
			ipmr_queue_xmit(net, mrt, skb, cache, psend);
			return;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
}

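/* Build a flow key from the packet and resolve the mr_table to use via the fib rules */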
static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph = ip_hdr(skb);
	struct flowi4 fl4 = {
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowi4_tos = RT_TOS(iph->tos),
		.flowi4_oif = (rt_is_output_route(rt) ?
			       skb->dev->ifindex : 0),
		.flowi4_iif = (rt_is_output_route(rt) ?
			       LOOPBACK_IFINDEX :
			       skb->dev->ifindex),
		.flowi4_mark = skb->mark,
	};
	struct mr_table *mrt;
	int err;

	err = ipmr_fib_lookup(net, &fl4, &mrt);
	if (err)
		return ERR_PTR(err);
	return mrt;
}

/* Multicast packets for forwarding arrive here
 * Called with rcu_read_lock();
 */
int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	struct net *net = dev_net(skb->dev);
	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
	struct mr_table *mrt;
	struct net_device *dev;

	/* skb->dev passed in is the loX master dev for vrfs.
	 * As there are no vifs associated with loopback devices,
	 * get the proper interface that does have a vif associated with it.
	 */
	dev = skb->dev;
	if (netif_is_l3_master(skb->dev)) {
		dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
		if (!dev) {
			kfree_skb(skb);
			return -ENODEV;
		}
	}

	/* Packet is looped back after forward, it should not be
	 * forwarded second time, but still can be delivered locally.
	 */
	if (IPCB(skb)->flags & IPSKB_FORWARDED)
		goto dont_forward;

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt)) {
		kfree_skb(skb);
		return PTR_ERR(mrt);
	}
	if (!local) {
		if (IPCB(skb)->opt.router_alert) {
			if (ip_call_ra_chain(skb))
				return 0;
		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
			/* IGMPv1 (and broken IGMPv2 implementations sort of
			 * Cisco IOS <= 11.2(8)) do not put router alert
			 * option to IGMP packets destined to routable
			 * groups. It is very bad, because it means
			 * that we can forward NO IGMP messages.
			 */
			struct sock *mroute_sk;

			mroute_sk = rcu_dereference(mrt->mroute_sk);
			if (mroute_sk) {
				nf_reset(skb);
				raw_rcv(mroute_sk, skb);
				return 0;
			}
		}
	}

	/* already under rcu_read_lock() */
	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
	if (!cache) {
		int vif = ipmr_find_vif(mrt, dev);

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
						    vif);
	}

	/* No usable cache entry */
	if (!cache) {
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
			if (!skb2)
				return -ENOBUFS;
			skb = skb2;
		}

		read_lock(&mrt_lock);
		vif = ipmr_find_vif(mrt, dev);
		if (vif >= 0) {
			int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
			read_unlock(&mrt_lock);

			return err2;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	read_lock(&mrt_lock);
	ip_mr_forward(net, mrt, dev, skb, cache, local);
	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/* Handle IGMP messages of PIMv1 */
int pim_rcv_v1(struct sk_buff *skb)
{
	struct igmphdr *pim;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = igmp_hdr(skb);

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt))
		goto drop;
	if (!mrt->mroute_do_pim ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt))
		goto drop;
	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

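/* Fill IIF, the RTA_MULTIPATH nexthop list and the MFC statistics for one cache entry */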
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			      struct mfc_cache *c, struct rtmsg *rtm)
{
	struct rta_mfc_stats mfcs;
	struct nlattr *mp_attr;
	struct rtnexthop *nhp;
	unsigned long lastuse;
	int ct;

	/* If cache is unresolved, don't try to parse IIF and OIF */
	if (c->mfc_parent >= MAXVIFS) {
		rtm->rtm_flags |= RTNH_F_UNRESOLVED;
		return -ENOENT;
	}

	if (VIF_EXISTS(mrt, c->mfc_parent) &&
	    nla_put_u32(skb, RTA_IIF, mrt->vif_table[c->mfc_parent].dev->ifindex) < 0)
		return -EMSGSIZE;

	if (!(mp_attr = nla_nest_start(skb, RTA_MULTIPATH)))
		return -EMSGSIZE;

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
			if (!(nhp = nla_reserve_nohdr(skb, sizeof(*nhp)))) {
				nla_nest_cancel(skb, mp_attr);
				return -EMSGSIZE;
			}

			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}

	nla_nest_end(skb, mp_attr);

	lastuse = READ_ONCE(c->mfc_un.res.lastuse);
	lastuse = time_after_eq(jiffies, lastuse) ? jiffies - lastuse : 0;

	mfcs.mfcs_packets = c->mfc_un.res.pkt;
	mfcs.mfcs_bytes = c->mfc_un.res.bytes;
	mfcs.mfcs_wrong_if = c->mfc_un.res.wrong_if;
	if (nla_put_64bit(skb, RTA_MFC_STATS, sizeof(mfcs), &mfcs, RTA_PAD) ||
	    nla_put_u64_64bit(skb, RTA_EXPIRES, jiffies_to_clock_t(lastuse),
			      RTA_PAD))
		return -EMSGSIZE;

	rtm->rtm_type = RTN_MULTICAST;
	return 1;
}

int ipmr_get_route(struct net *net, struct sk_buff *skb,
		   __be32 saddr, __be32 daddr,
		   struct rtmsg *rtm, u32 portid)
{
	struct mfc_cache *cache;
	struct mr_table *mrt;
	int err;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	rcu_read_lock();
	cache = ipmr_cache_find(mrt, saddr, daddr);
	if (!cache && skb->dev) {
		int vif = ipmr_find_vif(mrt, skb->dev);

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, daddr, vif);
	}
	if (!cache) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif = -1;

		dev = skb->dev;
		read_lock(&mrt_lock);
		if (dev)
			vif = ipmr_find_vif(mrt, dev);
		if (vif < 0) {
			read_unlock(&mrt_lock);
			rcu_read_unlock();
			return -ENODEV;
		}
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			rcu_read_unlock();
			return -ENOMEM;
		}

		NETLINK_CB(skb2).portid = portid;
		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = saddr;
		iph->daddr = daddr;
		iph->version = 0;
		err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
		read_unlock(&mrt_lock);
		rcu_read_unlock();
		return err;
	}

	read_lock(&mrt_lock);
	err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
	read_unlock(&mrt_lock);
	rcu_read_unlock();
	return err;
}

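/* Build a complete RTM_NEWROUTE netlink message for one (S,G) cache entry */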
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			    u32 portid, u32 seq, struct mfc_cache *c, int cmd,
			    int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	int err;

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family   = RTNL_FAMILY_IPMR;
	rtm->rtm_dst_len  = 32;
	rtm->rtm_src_len  = 32;
	rtm->rtm_tos      = 0;
	rtm->rtm_table    = mrt->id;
	if (nla_put_u32(skb, RTA_TABLE, mrt->id))
		goto nla_put_failure;
	rtm->rtm_type     = RTN_MULTICAST;
	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
	if (c->mfc_flags & MFC_STATIC)
		rtm->rtm_protocol = RTPROT_STATIC;
	else
		rtm->rtm_protocol = RTPROT_MROUTED;
	rtm->rtm_flags    = 0;

	if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
		goto nla_put_failure;
	err = __ipmr_fill_mroute(mrt, skb, c, rtm);
	/* do not break the dump if cache is unresolved */
	if (err < 0 && err != -ENOENT)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

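/* Worst-case netlink message size for one MFC entry notification */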
static size_t mroute_msgsize(bool unresolved, int maxvif)
{
	size_t len =
		NLMSG_ALIGN(sizeof(struct rtmsg))
		+ nla_total_size(4)	/* RTA_TABLE */
		+ nla_total_size(4)	/* RTA_SRC */
		+ nla_total_size(4)	/* RTA_DST */
		;

	if (!unresolved)
		len = len
		      + nla_total_size(4)	/* RTA_IIF */
		      + nla_total_size(0)	/* RTA_MULTIPATH */
		      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
						/* RTA_MFC_STATS */
		      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
		;

	return len;
}

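/* Notify userspace listeners on RTNLGRP_IPV4_MROUTE of an MFC cache change */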
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
				 int cmd)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(mroute_msgsize(mfc->mfc_parent >= MAXVIFS, mrt->maxvif),
			GFP_ATOMIC);
	if (!skb)
		goto errout;

	err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
	if (err < 0)
		goto errout;

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
	return;

errout:
	kfree_skb(skb);
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}

static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
{
	size_t len =
		NLMSG_ALIGN(sizeof(struct rtgenmsg))
		+ nla_total_size(1)	/* IPMRA_CREPORT_MSGTYPE */
		+ nla_total_size(4)	/* IPMRA_CREPORT_VIF_ID */
		+ nla_total_size(4)	/* IPMRA_CREPORT_SRC_ADDR */
		+ nla_total_size(4)	/* IPMRA_CREPORT_DST_ADDR */
					/* IPMRA_CREPORT_PKT */
		+ nla_total_size(payloadlen)
		;

	return len;
}

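/* Forward an igmpmsg cache report to userspace as an RTM_NEWCACHEREPORT notification */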
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
{
	struct net *net = read_pnet(&mrt->net);
	struct nlmsghdr *nlh;
	struct rtgenmsg *rtgenm;
	struct igmpmsg *msg;
	struct sk_buff *skb;
	struct nlattr *nla;
	int payloadlen;

	payloadlen = pkt->len - sizeof(struct igmpmsg);
	msg = (struct igmpmsg *)skb_network_header(pkt);

	skb = nlmsg_new(igmpmsg_netlink_msgsize(payloadlen), GFP_ATOMIC);
	if (!skb)
		goto errout;

	nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT,
			sizeof(struct rtgenmsg), 0);
	if (!nlh)
		goto errout;
	rtgenm = nlmsg_data(nlh);
	rtgenm->rtgen_family = RTNL_FAMILY_IPMR;
	if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, msg->im_msgtype) ||
	    nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, msg->im_vif) ||
	    nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
			    msg->im_src.s_addr) ||
	    nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
			    msg->im_dst.s_addr))
		goto nla_put_failure;

	nla = nla_reserve(skb, IPMRA_CREPORT_PKT, payloadlen);
	if (!nla || skb_copy_bits(pkt, sizeof(struct igmpmsg),
				  nla_data(nla), payloadlen))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
	return;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
errout:
	kfree_skb(skb);
	rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
}

static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX + 1];
	struct sk_buff *skb = NULL;
	struct mfc_cache *cache;
	struct mr_table *mrt;
	struct rtmsg *rtm;
	__be32 src, grp;
	u32 tableid;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
			  rtm_ipv4_policy, extack);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;

	mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
	if (!mrt) {
		err = -ENOENT;
		goto errout_free;
	}

	/* entries are added/deleted only under RTNL */
	rcu_read_lock();
	cache = ipmr_cache_find(mrt, src, grp);
	rcu_read_unlock();
	if (!cache) {
		err = -ENOENT;
		goto errout_free;
	}

	skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout_free;
	}

	err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
			       nlh->nlmsg_seq, cache,
			       RTM_NEWROUTE, 0);
	if (err < 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout:
	return err;

errout_free:
	kfree_skb(skb);
	goto errout;
}

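/* Dump resolved and unresolved MFC entries of every table for an RTM_GETROUTE dump */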
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct mr_table *mrt;
	struct mfc_cache *mfc;
	unsigned int t = 0, s_t;
	unsigned int e = 0, s_e;

	s_t = cb->args[0];
	s_e = cb->args[1];

	rcu_read_lock();
	ipmr_for_each_table(mrt, net) {
		if (t < s_t)
			goto next_table;
		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list) {
			if (e < s_e)
				goto next_entry;
			if (ipmr_fill_mroute(mrt, skb,
					     NETLINK_CB(cb->skb).portid,
					     cb->nlh->nlmsg_seq,
					     mfc, RTM_NEWROUTE,
					     NLM_F_MULTI) < 0)
				goto done;
next_entry:
			e++;
		}
		e = 0;
		s_e = 0;

		spin_lock_bh(&mfc_unres_lock);
		list_for_each_entry(mfc, &mrt->mfc_unres_queue, list) {
			if (e < s_e)
				goto next_entry2;
			if (ipmr_fill_mroute(mrt, skb,
					     NETLINK_CB(cb->skb).portid,
					     cb->nlh->nlmsg_seq,
					     mfc, RTM_NEWROUTE,
					     NLM_F_MULTI) < 0) {
				spin_unlock_bh(&mfc_unres_lock);
				goto done;
			}
next_entry2:
			e++;
		}
		spin_unlock_bh(&mfc_unres_lock);
		e = 0;
		s_e = 0;
next_table:
		t++;
	}
done:
	rcu_read_unlock();

	cb->args[1] = e;
	cb->args[0] = t;

	return skb->len;
}

static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
	[RTA_SRC]	= { .type = NLA_U32 },
	[RTA_DST]	= { .type = NLA_U32 },
	[RTA_IIF]	= { .type = NLA_U32 },
	[RTA_TABLE]	= { .type = NLA_U32 },
	[RTA_MULTIPATH]	= { .len = sizeof(struct rtnexthop) },
};

static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
{
	switch (rtm_protocol) {
	case RTPROT_STATIC:
	case RTPROT_MROUTED:
		return true;
	}
	return false;
}

static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
{
	struct rtnexthop *rtnh = nla_data(nla);
	int remaining = nla_len(nla), vifi = 0;

	while (rtnh_ok(rtnh, remaining)) {
		mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops;
		if (++vifi == MAXVIFS)
			break;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return remaining > 0 ? -EINVAL : vifi;
}

/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
			    struct mfcctl *mfcc, int *mrtsock,
			    struct mr_table **mrtret,
			    struct netlink_ext_ack *extack)
{
	struct net_device *dev = NULL;
	u32 tblid = RT_TABLE_DEFAULT;
	struct mr_table *mrt;
	struct nlattr *attr;
	struct rtmsg *rtm;
	int ret, rem;

	ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy,
			     extack);
	if (ret < 0)
		goto out;
	rtm = nlmsg_data(nlh);

	ret = -EINVAL;
	if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
	    rtm->rtm_type != RTN_MULTICAST ||
	    rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
	    !ipmr_rtm_validate_proto(rtm->rtm_protocol))
		goto out;

	memset(mfcc, 0, sizeof(*mfcc));
	mfcc->mfcc_parent = -1;
	ret = 0;
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
		switch (nla_type(attr)) {
		case RTA_SRC:
			mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
			break;
		case RTA_DST:
			mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
			break;
		case RTA_IIF:
			dev = __dev_get_by_index(net, nla_get_u32(attr));
			if (!dev) {
				ret = -ENODEV;
				goto out;
			}
			break;
		case RTA_MULTIPATH:
			if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
				ret = -EINVAL;
				goto out;
			}
			break;
		case RTA_PREFSRC:
			ret = 1;
			break;
		case RTA_TABLE:
			tblid = nla_get_u32(attr);
			break;
		}
	}
	mrt = ipmr_get_table(net, tblid);
	if (!mrt) {
		ret = -ENOENT;
		goto out;
	}
	*mrtret = mrt;
	*mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
	if (dev)
		mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);

out:
	return ret;
}

/* takes care of both newroute and delroute */
static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
			  struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	int ret, mrtsock, parent;
	struct mr_table *tbl;
	struct mfcctl mfcc;

	mrtsock = 0;
	tbl = NULL;
	ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
	if (ret < 0)
		return ret;

	parent = ret ? mfcc.mfcc_parent : -1;
	if (nlh->nlmsg_type == RTM_NEWROUTE)
		return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
	else
		return ipmr_mfc_delete(tbl, &mfcc, parent);
}

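/* Fill the per-table attributes used by the RTM_GETLINK table/VIF dump */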
static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
{
	u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);

	if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
	    nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
	    nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
			mrt->mroute_reg_vif_num) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
		       mrt->mroute_do_assert) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
		return false;

	return true;
}

static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
	struct nlattr *vif_nest;
	struct vif_device *vif;

	/* if the VIF doesn't exist just continue */
	if (!VIF_EXISTS(mrt, vifid))
		return true;

	vif = &mrt->vif_table[vifid];
	vif_nest = nla_nest_start(skb, IPMRA_VIF);
	if (!vif_nest)
		return false;
	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex) ||
	    nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid) ||
	    nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
			      IPMRA_VIFA_PAD) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
			      IPMRA_VIFA_PAD) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
			      IPMRA_VIFA_PAD) ||
	    nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
			      IPMRA_VIFA_PAD) ||
	    nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local) ||
	    nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote)) {
		nla_nest_cancel(skb, vif_nest);
		return false;
	}
	nla_nest_end(skb, vif_nest);

	return true;
}

static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct nlmsghdr *nlh = NULL;
	unsigned int t = 0, s_t;
	unsigned int e = 0, s_e;
	struct mr_table *mrt;

	s_t = cb->args[0];
	s_e = cb->args[1];

	ipmr_for_each_table(mrt, net) {
		struct nlattr *vifs, *af;
		struct ifinfomsg *hdr;
		u32 i;

		if (t < s_t)
			goto skip_table;
		nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				cb->nlh->nlmsg_seq, RTM_NEWLINK,
				sizeof(*hdr), NLM_F_MULTI);
		if (!nlh)
			break;

		hdr = nlmsg_data(nlh);
		memset(hdr, 0, sizeof(*hdr));
		hdr->ifi_family = RTNL_FAMILY_IPMR;

		af = nla_nest_start(skb, IFLA_AF_SPEC);
		if (!af) {
			nlmsg_cancel(skb, nlh);
			goto out;
		}

		if (!ipmr_fill_table(mrt, skb)) {
			nlmsg_cancel(skb, nlh);
			goto out;
		}

		vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS);
		if (!vifs) {
			nla_nest_end(skb, af);
			nlmsg_end(skb, nlh);
			goto out;
		}
		for (i = 0; i < mrt->maxvif; i++) {
			if (e < s_e)
				goto skip_entry;
			if (!ipmr_fill_vif(mrt, i, skb)) {
				nla_nest_end(skb, vifs);
				nla_nest_end(skb, af);
				nlmsg_end(skb, nlh);
				goto out;
			}
skip_entry:
			e++;
		}
		s_e = 0;
		e = 0;
		nla_nest_end(skb, vifs);
		nla_nest_end(skb, af);
		nlmsg_end(skb, nlh);
skip_table:
		t++;
	}

out:
	cb->args[1] = e;
	cb->args[0] = t;

	return skb->len;
}

#ifdef CONFIG_PROC_FS
/* The /proc interfaces to multicast routing :
 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
	struct seq_net_private p;
	struct mr_table *mrt;
	int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct net *net,
					   struct ipmr_vif_iter *iter,
					   loff_t pos)
{
	struct mr_table *mrt = iter->mrt;

	for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
		if (!VIF_EXISTS(mrt, iter->ct))
			continue;
		if (pos-- == 0)
			return &mrt->vif_table[iter->ct];
	}
	return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	struct ipmr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	iter->mrt = mrt;

	read_lock(&mrt_lock);
	return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = iter->mrt;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(net, iter, 0);

	while (++iter->ct < mrt->maxvif) {
		if (!VIF_EXISTS(mrt, iter->ct))
			continue;
		return &mrt->vif_table[iter->ct];
	}
	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(mrt_lock)
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	struct ipmr_vif_iter *iter = seq->private;
	struct mr_table *mrt = iter->mrt;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name =  vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - mrt->vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

static const struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
			    sizeof(struct ipmr_vif_iter));
}

static const struct file_operations ipmr_vif_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};

struct ipmr_mfc_iter {
	struct seq_net_private p;
	struct mr_table *mrt;
	struct list_head *cache;
};

static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
					  struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mr_table *mrt = it->mrt;
	struct mfc_cache *mfc;

	rcu_read_lock();
	it->cache = &mrt->mfc_cache_list;
	list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
		if (pos-- == 0)
			return mfc;
	rcu_read_unlock();

	spin_lock_bh(&mfc_unres_lock);
	it->cache = &mrt->mfc_unres_queue;
	list_for_each_entry(mfc, it->cache, list)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	it->mrt = mrt;
	it->cache = NULL;
	return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt = it->mrt;
	struct mfc_cache *mfc = v;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(net, seq->private, 0);

	if (mfc->list.next != it->cache)
		return list_entry(mfc->list.next, struct mfc_cache, list);

	if (it->cache == &mrt->mfc_unres_queue)
		goto end_of_list;

	/* exhausted cache_array, show unresolved */
	rcu_read_unlock();
	it->cache = &mrt->mfc_unres_queue;

	spin_lock_bh(&mfc_unres_lock);
	if (!list_empty(it->cache))
		return list_first_entry(it->cache, struct mfc_cache, list);

end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;
	struct mr_table *mrt = it->mrt;

	if (it->cache == &mrt->mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == &mrt->mfc_cache_list)
		rcu_read_unlock();
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;
		const struct mr_table *mrt = it->mrt;

		seq_printf(seq, "%08X %08X %-3hd",
			   (__force u32) mfc->mfc_mcastgrp,
			   (__force u32) mfc->mfc_origin,
			   mfc->mfc_parent);

		if (it->cache != &mrt->mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu",
				   mfc->mfc_un.res.pkt,
				   mfc->mfc_un.res.bytes,
				   mfc->mfc_un.res.wrong_if);
			for (n = mfc->mfc_un.res.minvif;
			     n < mfc->mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(mrt, n) &&
				    mfc->mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
					   " %2d:%-3d",
					   n, mfc->mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
			    sizeof(struct ipmr_mfc_iter));
}

static const struct file_operations ipmr_mfc_fops = {
	.owner	 = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_net,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
static const struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
	.netns_ok	=	1,
};
#endif

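/* FIB notifier plumbing: sequence counter and full replay of VIF and MFC entries */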
static unsigned int ipmr_seq_read(struct net *net)
{
	ASSERT_RTNL();

	return net->ipv4.ipmr_seq + ipmr_rules_seq_read(net);
}

static int ipmr_dump(struct net *net, struct notifier_block *nb)
{
	struct mr_table *mrt;
	int err;

	err = ipmr_rules_dump(net, nb);
	if (err)
		return err;

	ipmr_for_each_table(mrt, net) {
		struct vif_device *v = &mrt->vif_table[0];
		struct mfc_cache *mfc;
		int vifi;

		/* Notify on table VIF entries */
		read_lock(&mrt_lock);
		for (vifi = 0; vifi < mrt->maxvif; vifi++, v++) {
			if (!v->dev)
				continue;

			call_ipmr_vif_entry_notifier(nb, net, FIB_EVENT_VIF_ADD,
						     v, vifi, mrt->id);
		}
		read_unlock(&mrt_lock);

		/* Notify on table MFC entries */
		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
			call_ipmr_mfc_entry_notifier(nb, net,
						     FIB_EVENT_ENTRY_ADD, mfc,
						     mrt->id);
	}

	return 0;
}

static const struct fib_notifier_ops ipmr_notifier_ops_template = {
	.family		= RTNL_FAMILY_IPMR,
	.fib_seq_read	= ipmr_seq_read,
	.fib_dump	= ipmr_dump,
	.owner		= THIS_MODULE,
};

int __net_init ipmr_notifier_init(struct net *net)
{
	struct fib_notifier_ops *ops;

	net->ipv4.ipmr_seq = 0;

	ops = fib_notifier_ops_register(&ipmr_notifier_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);
	net->ipv4.ipmr_notifier_ops = ops;

	return 0;
}

static void __net_exit ipmr_notifier_exit(struct net *net)
{
	fib_notifier_ops_unregister(net->ipv4.ipmr_notifier_ops);
	net->ipv4.ipmr_notifier_ops = NULL;
}

/* Setup for IP multicast routing */
static int __net_init ipmr_net_init(struct net *net)
{
	int err;

	err = ipmr_notifier_init(net);
	if (err)
		goto ipmr_notifier_fail;

	err = ipmr_rules_init(net);
	if (err < 0)
		goto ipmr_rules_fail;

#ifdef CONFIG_PROC_FS
	err = -ENOMEM;
	if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
		goto proc_vif_fail;
	if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
		goto proc_cache_fail;
#endif
	return 0;

#ifdef CONFIG_PROC_FS
proc_cache_fail:
	remove_proc_entry("ip_mr_vif", net->proc_net);
proc_vif_fail:
	ipmr_rules_exit(net);
#endif
ipmr_rules_fail:
	ipmr_notifier_exit(net);
ipmr_notifier_fail:
	return err;
}

static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ip_mr_cache", net->proc_net);
	remove_proc_entry("ip_mr_vif", net->proc_net);
#endif
	ipmr_notifier_exit(net);
	ipmr_rules_exit(net);
}

static struct pernet_operations ipmr_net_ops = {
	.init = ipmr_net_init,
	.exit = ipmr_net_exit,
};

int __init ip_mr_init(void)
{
	int err;

	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache),
				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
				       NULL);

	err = register_pernet_subsys(&ipmr_net_ops);
	if (err)
		goto reg_pernet_fail;

	err = register_netdevice_notifier(&ip_mr_notifier);
	if (err)
		goto reg_notif_fail;
#ifdef CONFIG_IP_PIMSM_V2
	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
		pr_err("%s: can't add PIM protocol\n", __func__);
		err = -EAGAIN;
		goto add_proto_fail;
	}
#endif
	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
		      ipmr_rtm_getroute, ipmr_rtm_dumproute, 0);
	rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
		      ipmr_rtm_route, NULL, 0);
	rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
		      ipmr_rtm_route, NULL, 0);

	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK,
		      NULL, ipmr_rtm_dumplink, 0);
	return 0;

#ifdef CONFIG_IP_PIMSM_V2
add_proto_fail:
	unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
	unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);
	return err;
}