ipmr.c 74.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3
/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

29
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
30
#include <linux/types.h>
31
#include <linux/cache.h>
32
#include <linux/capability.h>
L
Linus Torvalds 已提交
33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
48
#include <linux/if_ether.h>
49
#include <linux/slab.h>
50
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
51 52 53
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
54
#include <net/route.h>
L
Linus Torvalds 已提交
55 56 57 58 59 60
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
61
#include <linux/compat.h>
62
#include <linux/export.h>
63
#include <net/ip_tunnels.h>
L
Linus Torvalds 已提交
64
#include <net/checksum.h>
65
#include <net/netlink.h>
66
#include <net/fib_rules.h>
67
#include <linux/netconf.h>
68
#include <net/nexthop.h>
69
#include <net/switchdev.h>
L
Linus Torvalds 已提交
70

71 72 73 74 75 76 77 78
/* IPMR policy-routing rule. It carries nothing beyond the generic
 * fib_rule state; its size is what the rules ops template registers.
 */
struct ipmr_rule {
	struct fib_rule		common;
};

/* Result slot of an IPMR rule lookup: the multicast routing table that
 * the matching rule selected.
 */
struct ipmr_result {
	struct mr_table		*mrt;
};

L
Linus Torvalds 已提交
79
/* Big lock, protecting vif table, mrt cache and mroute socket state.
 * Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

85
/* Multicast router control variables */
L
Linus Torvalds 已提交
86 87 88 89 90

/* Special spinlock for the queue of unresolved entries; the expiry timer
 * in this file takes it with spin_trylock() before walking that queue.
 */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
 * entries is changed only in process context and is protected
 * by the weak lock mrt_lock. The queue of unresolved entries is protected
 * by the strong spinlock mfc_unres_lock.
 *
 * In this case the data path is entirely free of exclusive locks.
 */

98
/* Slab cache from which struct mfc_cache entries are allocated and freed. */
static struct kmem_cache *mrt_cachep __ro_after_init;
L
Linus Torvalds 已提交
99

100
static struct mr_table *ipmr_new_table(struct net *net, u32 id);
101 102
static void ipmr_free_table(struct mr_table *mrt);

103
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
104 105
			  struct net_device *dev, struct sk_buff *skb,
			  struct mfc_cache *cache, int local);
106
static int ipmr_cache_report(struct mr_table *mrt,
107
			     struct sk_buff *pkt, vifi_t vifi, int assert);
108 109
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
				 int cmd);
110
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt);
111
static void mroute_clean_tables(struct mr_table *mrt, bool all);
112
static void ipmr_expire_process(struct timer_list *t);
113 114 115 116 117

#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
/* Walk every multicast routing table linked on @net's mr_tables list,
 * using the RCU list iterator.
 */
#define ipmr_for_each_table(mrt, net) \
	list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

Y
Yuval Mintz 已提交
118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134
/* Table iterator: return the table that follows @mrt on @net's table
 * list, or the first table when @mrt is NULL. Returns NULL once the walk
 * has come back around to the list head.
 */
static struct mr_table *ipmr_mr_table_iter(struct net *net,
					   struct mr_table *mrt)
{
	struct mr_table *candidate;

	if (mrt)
		candidate = list_entry_rcu(mrt->list.next,
					   struct mr_table, list);
	else
		candidate = list_entry_rcu(net->ipv4.mr_tables.next,
					   struct mr_table, list);

	/* The list head is not embedded in a table: reaching it ends the walk. */
	return &candidate->list != &net->ipv4.mr_tables ? candidate : NULL;
}

135 136 137 138 139 140 141 142 143 144 145
/* Look up the multicast routing table of @net whose id equals @id.
 * Returns the table, or NULL when no table carries that id.
 */
static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	struct mr_table *tbl;

	ipmr_for_each_table(tbl, net) {
		if (tbl->id != id)
			continue;
		return tbl;
	}

	return NULL;
}

D
David S. Miller 已提交
146
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
147 148 149
			   struct mr_table **mrt)
{
	int err;
150 151 152 153 154
	struct ipmr_result res;
	struct fib_lookup_arg arg = {
		.result = &res,
		.flags = FIB_LOOKUP_NOREF,
	};
155

156 157 158
	/* update flow if oif or iif point to device enslaved to l3mdev */
	l3mdev_update_flow(net, flowi4_to_flowi(flp4));

D
David S. Miller 已提交
159 160
	err = fib_rules_lookup(net->ipv4.mr_rules_ops,
			       flowi4_to_flowi(flp4), 0, &arg);
161 162 163 164 165 166 167 168 169 170 171
	if (err < 0)
		return err;
	*mrt = res.mrt;
	return 0;
}

static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
			    int flags, struct fib_lookup_arg *arg)
{
	struct ipmr_result *res = arg->result;
	struct mr_table *mrt;
L
Linus Torvalds 已提交
172

173 174 175 176 177 178 179 180 181 182 183 184
	switch (rule->action) {
	case FR_ACT_TO_TBL:
		break;
	case FR_ACT_UNREACHABLE:
		return -ENETUNREACH;
	case FR_ACT_PROHIBIT:
		return -EACCES;
	case FR_ACT_BLACKHOLE:
	default:
		return -EINVAL;
	}

185 186 187
	arg->table = fib_rule_get_table(rule, arg);

	mrt = ipmr_get_table(rule->fr_net, arg->table);
188
	if (!mrt)
189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
		return -EAGAIN;
	res->mrt = mrt;
	return 0;
}

/* fib_rules "match" callback: always returns 1, i.e. no IPMR-specific
 * selector is evaluated here.
 */
static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
	return 1;
}

/* Netlink attribute policy for IPMR rules: only the generic FRA_*
 * attributes are described.
 */
static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
	FRA_GENERIC_POLICY,
};

/* fib_rules "configure" callback: nothing family-specific to parse,
 * always returns 0.
 */
static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
			       struct fib_rule_hdr *frh, struct nlattr **tb)
{
	return 0;
}

/* fib_rules "compare" callback: always returns 1, since IPMR rules have
 * no family-specific fields to compare.
 */
static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
			     struct nlattr **tb)
{
	return 1;
}

/* fib_rules "fill" callback: IPMR rules have no address or TOS selector,
 * so those header fields of a dumped rule are reported as zero.
 * Always returns 0.
 */
static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
			  struct fib_rule_hdr *frh)
{
	frh->tos = 0;
	frh->src_len = 0;
	frh->dst_len = 0;

	return 0;
}

224
/* Template of the fib_rules operations for the IPMR family; it is passed
 * to fib_rules_register() once per network namespace.
 */
static const struct fib_rules_ops __net_initconst ipmr_rules_ops_template = {
	.family		= RTNL_FAMILY_IPMR,
	.rule_size	= sizeof(struct ipmr_rule),
	.addr_size	= sizeof(u32),
	.action		= ipmr_rule_action,
	.match		= ipmr_rule_match,
	.configure	= ipmr_rule_configure,
	.compare	= ipmr_rule_compare,
	.fill		= ipmr_rule_fill,
	.nlgroup	= RTNLGRP_IPV4_RULE,
	.policy		= ipmr_rule_policy,
	.owner		= THIS_MODULE,
};

/* Per-netns setup for the multiple-tables build: register the IPMR rule
 * ops, create the default table and add the default rule pointing at it.
 *
 * Returns 0 on success or a negative error; on failure everything set up
 * so far is torn down again in reverse order.
 */
static int __net_init ipmr_rules_init(struct net *net)
{
	struct fib_rules_ops *ops;
	struct mr_table *mrt;
	int err;

	ops = fib_rules_register(&ipmr_rules_ops_template, net);
	if (IS_ERR(ops))
		return PTR_ERR(ops);

	INIT_LIST_HEAD(&net->ipv4.mr_tables);

	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	if (IS_ERR(mrt)) {
		err = PTR_ERR(mrt);
		goto err1;
	}

	err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
	if (err < 0)
		goto err2;

	net->ipv4.mr_rules_ops = ops;
	return 0;

err2:
	ipmr_free_table(mrt);
err1:
	fib_rules_unregister(ops);
	return err;
}

/* Per-netns teardown for the multiple-tables build: under RTNL, unlink
 * and free every table of @net, then unregister the rule ops.
 */
static void __net_exit ipmr_rules_exit(struct net *net)
{
	struct mr_table *mrt, *next;

	rtnl_lock();
	/* _safe variant: each entry is unlinked and freed inside the loop. */
	list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
		list_del(&mrt->list);
		ipmr_free_table(mrt);
	}
	fib_rules_unregister(net->ipv4.mr_rules_ops);
	rtnl_unlock();
}
282 283 284 285 286 287 288 289 290 291

/* Dump the IPMR rules of @net to notifier block @nb; thin wrapper that
 * returns whatever fib_rules_dump() returns for RTNL_FAMILY_IPMR.
 */
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
{
	return fib_rules_dump(net, nb, RTNL_FAMILY_IPMR);
}

/* Return the value of fib_rules_seq_read() for the IPMR family in @net. */
static unsigned int ipmr_rules_seq_read(struct net *net)
{
	return fib_rules_seq_read(net, RTNL_FAMILY_IPMR);
}
292 293 294 295 296 297

/* Return true when @rule is a match-all rule (per fib_rule_matchall())
 * that points at RT_TABLE_DEFAULT. Exported for use outside this file.
 */
bool ipmr_rule_default(const struct fib_rule *rule)
{
	return fib_rule_matchall(rule) && rule->table == RT_TABLE_DEFAULT;
}
EXPORT_SYMBOL(ipmr_rule_default);
298 299 300 301
#else
/* Single-table build: the "loop" visits net->ipv4.mrt at most once
 * (not at all when it is NULL).
 */
#define ipmr_for_each_table(mrt, net) \
	for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

Y
Yuval Mintz 已提交
302 303 304 305 306 307 308 309
/* Table iterator for the single-table build: the first step (@mrt NULL)
 * yields the netns' only table, any further step ends the walk with NULL.
 */
static struct mr_table *ipmr_mr_table_iter(struct net *net,
					   struct mr_table *mrt)
{
	return mrt ? NULL : net->ipv4.mrt;
}

310 311 312 313 314
/* Single-table build: @id is ignored and the netns' only table is
 * returned.
 */
static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
	return net->ipv4.mrt;
}

D
David S. Miller 已提交
315
static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
316 317 318 319 320 321 322 323
			   struct mr_table **mrt)
{
	*mrt = net->ipv4.mrt;
	return 0;
}

/* Per-netns setup for the single-table build: create the default table
 * and store it in net->ipv4.mrt.
 *
 * Returns 0 on success or the error encoded in ipmr_new_table()'s result.
 */
static int __net_init ipmr_rules_init(struct net *net)
{
	struct mr_table *mrt;

	mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
	if (IS_ERR(mrt))
		return PTR_ERR(mrt);
	net->ipv4.mrt = mrt;
	return 0;
}

/* Per-netns teardown for the single-table build: free the table under
 * RTNL and clear the netns pointer so it cannot be reused afterwards.
 */
static void __net_exit ipmr_rules_exit(struct net *net)
{
	rtnl_lock();
	ipmr_free_table(net->ipv4.mrt);
	net->ipv4.mrt = NULL;
	rtnl_unlock();
}
340 341 342 343 344 345 346 347 348 349

/* Single-table build: there are no rules to dump, always returns 0. */
static int ipmr_rules_dump(struct net *net, struct notifier_block *nb)
{
	return 0;
}

/* Single-table build: no rules exist, so the rules sequence is always 0. */
static unsigned int ipmr_rules_seq_read(struct net *net)
{
	return 0;
}
350 351 352 353 354 355

/* Single-table build: every rule is reported as the default rule.
 * Exported for use outside this file.
 */
bool ipmr_rule_default(const struct fib_rule *rule)
{
	return true;
}
EXPORT_SYMBOL(ipmr_rule_default);
356 357
#endif

358 359 360 361 362 363 364 365 366 367 368
/* rhashtable object comparison for MFC entries.
 *
 * @arg->key is the (group, origin) pair being looked up and @ptr the
 * candidate entry. Returns 0 when both the multicast group and the
 * origin are equal, 1 otherwise.
 */
static inline int ipmr_hash_cmp(struct rhashtable_compare_arg *arg,
				const void *ptr)
{
	const struct mfc_cache_cmp_arg *key = arg->key;
	struct mfc_cache *entry = (struct mfc_cache *)ptr;

	return !(key->mfc_mcastgrp == entry->mfc_mcastgrp &&
		 key->mfc_origin == entry->mfc_origin);
}

static const struct rhashtable_params ipmr_rht_params = {
369
	.head_offset = offsetof(struct mr_mfc, mnode),
370 371 372 373 374 375 376 377
	.key_offset = offsetof(struct mfc_cache, cmparg),
	.key_len = sizeof(struct mfc_cache_cmp_arg),
	.nelem_hint = 3,
	.locks_mul = 1,
	.obj_cmpfn = ipmr_hash_cmp,
	.automatic_shrinking = true,
};

378 379 380 381 382 383 384 385
/* Callback handed to mr_table_alloc() by ipmr_new_table(): with multiple
 * tables enabled it appends @mrt to @net's table list; otherwise it does
 * nothing.
 */
static void ipmr_new_table_set(struct mr_table *mrt,
			       struct net *net)
{
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
	list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
}

386 387 388 389 390 391 392 393 394 395
/* Lookup key with both group and origin set to INADDR_ANY; installed as
 * the .cmparg_any member of the table ops.
 */
static struct mfc_cache_cmp_arg ipmr_mr_table_ops_cmparg_any = {
	.mfc_mcastgrp = htonl(INADDR_ANY),
	.mfc_origin = htonl(INADDR_ANY),
};

/* IPv4 table ops passed to mr_table_alloc(): the hash parameters and the
 * wildcard lookup key.
 */
static struct mr_table_ops ipmr_mr_table_ops = {
	.rht_params = &ipmr_rht_params,
	.cmparg_any = &ipmr_mr_table_ops_cmparg_any,
};

396 397 398
/* Return the multicast routing table @id of @net, creating it if needed.
 *
 * Non-default ids of 1000000000 and above are rejected with
 * ERR_PTR(-EINVAL) so that the derived "pimreg%u" device name stays
 * within IFNAMSIZ. An already existing table is returned as is; otherwise
 * the result of mr_table_alloc() is returned.
 */
static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
	struct mr_table *mrt;

	/* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
	if (id != RT_TABLE_DEFAULT && id >= 1000000000)
		return ERR_PTR(-EINVAL);

	mrt = ipmr_get_table(net, id);
	if (mrt)
		return mrt;

	return mr_table_alloc(net, id, &ipmr_mr_table_ops,
			      ipmr_expire_process, ipmr_new_table_set);
}
L
Linus Torvalds 已提交
411

412 413 414
static void ipmr_free_table(struct mr_table *mrt)
{
	del_timer_sync(&mrt->ipmr_expire_timer);
415
	mroute_clean_tables(mrt, true);
416
	rhltable_destroy(&mrt->mfc_hash);
417 418 419
	kfree(mrt);
}

L
Linus Torvalds 已提交
420 421
/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

422 423
static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
424 425
	struct net *net = dev_net(dev);

426 427
	dev_close(dev);

428
	dev = __dev_get_by_name(net, "tunl0");
429
	if (dev) {
430
		const struct net_device_ops *ops = dev->netdev_ops;
431 432 433 434 435 436 437 438 439 440 441 442
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

443 444 445 446 447 448 449
		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
450 451 452
	}
}

453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469
/* Initialize ipmr pimreg/tunnel in_device */
static bool ipmr_init_vif_indev(const struct net_device *dev)
{
	struct in_device *in_dev;

	ASSERT_RTNL();

	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev)
		return false;
	ipv4_devconf_setall(in_dev);
	neigh_parms_data_state_setall(in_dev->arp_parms);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

	return true;
}

470
static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
L
Linus Torvalds 已提交
471 472 473
{
	struct net_device  *dev;

474
	dev = __dev_get_by_name(net, "tunl0");
L
Linus Torvalds 已提交
475 476

	if (dev) {
477
		const struct net_device_ops *ops = dev->netdev_ops;
L
Linus Torvalds 已提交
478 479 480 481 482 483 484 485 486 487 488
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
S
Stephen Hemminger 已提交
489
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
L
Linus Torvalds 已提交
490

491 492 493 494 495 496
		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
			set_fs(oldfs);
E
Eric Dumazet 已提交
497
		} else {
498
			err = -EOPNOTSUPP;
E
Eric Dumazet 已提交
499
		}
L
Linus Torvalds 已提交
500 501
		dev = NULL;

502 503
		if (err == 0 &&
		    (dev = __dev_get_by_name(net, p.name)) != NULL) {
L
Linus Torvalds 已提交
504
			dev->flags |= IFF_MULTICAST;
505
			if (!ipmr_init_vif_indev(dev))
L
Linus Torvalds 已提交
506 507 508
				goto failure;
			if (dev_open(dev))
				goto failure;
509
			dev_hold(dev);
L
Linus Torvalds 已提交
510 511 512 513 514 515 516 517 518
		}
	}
	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}

519
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
520
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
L
Linus Torvalds 已提交
521
{
522
	struct net *net = dev_net(dev);
523
	struct mr_table *mrt;
D
David S. Miller 已提交
524 525
	struct flowi4 fl4 = {
		.flowi4_oif	= dev->ifindex,
526
		.flowi4_iif	= skb->skb_iif ? : LOOPBACK_IFINDEX,
D
David S. Miller 已提交
527
		.flowi4_mark	= skb->mark,
528 529 530
	};
	int err;

D
David S. Miller 已提交
531
	err = ipmr_fib_lookup(net, &fl4, &mrt);
532 533
	if (err < 0) {
		kfree_skb(skb);
534
		return err;
535
	}
536

L
Linus Torvalds 已提交
537
	read_lock(&mrt_lock);
538 539
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
540
	ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
L
Linus Torvalds 已提交
541 542
	read_unlock(&mrt_lock);
	kfree_skb(skb);
543
	return NETDEV_TX_OK;
L
Linus Torvalds 已提交
544 545
}

546 547 548 549 550
/* ndo_get_iflink for the pimreg device: always reports 0. */
static int reg_vif_get_iflink(const struct net_device *dev)
{
	return 0;
}

551 552
static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
553
	.ndo_get_iflink = reg_vif_get_iflink,
554 555
};

L
Linus Torvalds 已提交
556 557 558
static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
559
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
L
Linus Torvalds 已提交
560
	dev->flags		= IFF_NOARP;
561
	dev->netdev_ops		= &reg_vif_netdev_ops;
562
	dev->needs_free_netdev	= true;
T
Tom Goff 已提交
563
	dev->features		|= NETIF_F_NETNS_LOCAL;
L
Linus Torvalds 已提交
564 565
}

566
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
L
Linus Torvalds 已提交
567 568
{
	struct net_device *dev;
569
	char name[IFNAMSIZ];
L
Linus Torvalds 已提交
570

571 572 573 574
	if (mrt->id == RT_TABLE_DEFAULT)
		sprintf(name, "pimreg");
	else
		sprintf(name, "pimreg%u", mrt->id);
L
Linus Torvalds 已提交
575

576
	dev = alloc_netdev(0, name, NET_NAME_UNKNOWN, reg_vif_setup);
L
Linus Torvalds 已提交
577

578
	if (!dev)
L
Linus Torvalds 已提交
579 580
		return NULL;

T
Tom Goff 已提交
581 582
	dev_net_set(dev, net);

L
Linus Torvalds 已提交
583 584 585 586 587
	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}

588
	if (!ipmr_init_vif_indev(dev))
L
Linus Torvalds 已提交
589 590 591 592
		goto failure;
	if (dev_open(dev))
		goto failure;

593 594
	dev_hold(dev);

L
Linus Torvalds 已提交
595 596 597 598 599 600
	return dev;

failure:
	unregister_netdevice(dev);
	return NULL;
}
601 602 603 604 605 606 607 608 609

/* called with rcu_read_lock() */
static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
		     unsigned int pimlen)
{
	struct net_device *reg_dev = NULL;
	struct iphdr *encap;

	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
610
	/* Check that:
611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644
	 * a. packet is really sent to a multicast group
	 * b. packet is not a NULL-REGISTER
	 * c. packet is not truncated
	 */
	if (!ipv4_is_multicast(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + pimlen > skb->len)
		return 1;

	read_lock(&mrt_lock);
	if (mrt->mroute_reg_vif_num >= 0)
		reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
	read_unlock(&mrt_lock);

	if (!reg_dev)
		return 1;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = CHECKSUM_NONE;

	skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));

	netif_rx(skb);

	return NET_RX_SUCCESS;
}
#else
/* Stub used when neither PIM-SM v1 nor v2 is configured: no register
 * device can be created, always returns NULL.
 */
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
	return NULL;
}
L
Linus Torvalds 已提交
645 646
#endif

647 648 649 650 651 652
/* Deliver a vif event for the IPMR family to the single notifier block
 * @nb; thin wrapper that returns the result of mr_call_vif_notifier().
 */
static int call_ipmr_vif_entry_notifier(struct notifier_block *nb,
					struct net *net,
					enum fib_event_type event_type,
					struct vif_device *vif,
					vifi_t vif_index, u32 tb_id)
{
	return mr_call_vif_notifier(nb, net, RTNL_FAMILY_IPMR, event_type,
				    vif, vif_index, tb_id);
}

657 658 659 660 661
/* Broadcast a vif event for the IPMR family; thin wrapper around
 * mr_call_vif_notifiers(), which is also handed the netns' ipmr_seq
 * counter. Returns that helper's result.
 */
static int call_ipmr_vif_entry_notifiers(struct net *net,
					 enum fib_event_type event_type,
					 struct vif_device *vif,
					 vifi_t vif_index, u32 tb_id)
{
	return mr_call_vif_notifiers(net, RTNL_FAMILY_IPMR, event_type,
				     vif, vif_index, tb_id,
				     &net->ipv4.ipmr_seq);
}

667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683
/* Deliver an MFC event to the single notifier block @nb.
 *
 * Wraps @mfc and @tb_id in an mfc_entry_notifier_info tagged with the
 * IPMR family and @net, and returns the result of call_fib_notifier().
 */
static int call_ipmr_mfc_entry_notifier(struct notifier_block *nb,
					struct net *net,
					enum fib_event_type event_type,
					struct mfc_cache *mfc, u32 tb_id)
{
	struct mfc_entry_notifier_info mfc_info = {
		.mfc = mfc,
		.tb_id = tb_id,
		.info = {
			.net = net,
			.family = RTNL_FAMILY_IPMR,
		},
	};

	return call_fib_notifier(nb, net, event_type, &mfc_info.info);
}

684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701
/* Broadcast an MFC event to all FIB notifiers.
 *
 * Must be called with RTNL held. Bumps the netns' ipmr_seq counter before
 * notifying and returns the result of call_fib_notifiers().
 */
static int call_ipmr_mfc_entry_notifiers(struct net *net,
					 enum fib_event_type event_type,
					 struct mfc_cache *mfc, u32 tb_id)
{
	struct mfc_entry_notifier_info mfc_info = {
		.mfc = mfc,
		.tb_id = tb_id,
		.info = {
			.net = net,
			.family = RTNL_FAMILY_IPMR,
		},
	};

	ASSERT_RTNL();
	net->ipv4.ipmr_seq++;

	return call_fib_notifiers(net, event_type, &mfc_info.info);
}

702 703
/**
 *	vif_delete - Delete a VIF entry
704
 *	@notify: Set to 1, if the caller is a notifier_call
L
Linus Torvalds 已提交
705
 */
706
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
707
		      struct list_head *head)
L
Linus Torvalds 已提交
708
{
709
	struct net *net = read_pnet(&mrt->net);
L
Linus Torvalds 已提交
710 711 712 713
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

714
	if (vifi < 0 || vifi >= mrt->maxvif)
L
Linus Torvalds 已提交
715 716
		return -EADDRNOTAVAIL;

717
	v = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
718

719 720 721 722
	if (VIF_EXISTS(mrt, vifi))
		call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_DEL, v, vifi,
					      mrt->id);

L
Linus Torvalds 已提交
723 724 725 726 727 728 729 730 731
	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

732 733
	if (vifi == mrt->mroute_reg_vif_num)
		mrt->mroute_reg_vif_num = -1;
L
Linus Torvalds 已提交
734

E
Eric Dumazet 已提交
735
	if (vifi + 1 == mrt->maxvif) {
L
Linus Torvalds 已提交
736
		int tmp;
E
Eric Dumazet 已提交
737 738

		for (tmp = vifi - 1; tmp >= 0; tmp--) {
739
			if (VIF_EXISTS(mrt, tmp))
L
Linus Torvalds 已提交
740 741
				break;
		}
742
		mrt->maxvif = tmp+1;
L
Linus Torvalds 已提交
743 744 745 746 747 748
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

E
Eric Dumazet 已提交
749 750
	in_dev = __in_dev_get_rtnl(dev);
	if (in_dev) {
751
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
752
		inet_netconf_notify_devconf(dev_net(dev), RTM_NEWNETCONF,
753 754
					    NETCONFA_MC_FORWARDING,
					    dev->ifindex, &in_dev->cnf);
L
Linus Torvalds 已提交
755 756 757
		ip_rt_multicast_event(in_dev);
	}

E
Eric Dumazet 已提交
758
	if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify)
759
		unregister_netdevice_queue(dev, head);
L
Linus Torvalds 已提交
760 761 762 763 764

	dev_put(dev);
	return 0;
}

765
static void ipmr_cache_free_rcu(struct rcu_head *head)
766
{
767
	struct mr_mfc *c = container_of(head, struct mr_mfc, rcu);
768

769
	kmem_cache_free(mrt_cachep, (struct mfc_cache *)c);
770 771
}

772
void ipmr_cache_free(struct mfc_cache *c)
773
{
774
	call_rcu(&c->_c.rcu, ipmr_cache_free_rcu);
775
}
776
EXPORT_SYMBOL(ipmr_cache_free);
777

L
Linus Torvalds 已提交
778
/* Destroy an unresolved cache entry, killing queued skbs
E
Eric Dumazet 已提交
779
 * and reporting error to netlink readers.
L
Linus Torvalds 已提交
780
 */
781
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
L
Linus Torvalds 已提交
782
{
783
	struct net *net = read_pnet(&mrt->net);
L
Linus Torvalds 已提交
784
	struct sk_buff *skb;
785
	struct nlmsgerr *e;
L
Linus Torvalds 已提交
786

787
	atomic_dec(&mrt->cache_resolve_queue_len);
L
Linus Torvalds 已提交
788

789
	while ((skb = skb_dequeue(&c->_c.mfc_un.unres.unresolved))) {
790
		if (ip_hdr(skb)->version == 0) {
791 792
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct iphdr));
L
Linus Torvalds 已提交
793
			nlh->nlmsg_type = NLMSG_ERROR;
794
			nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
L
Linus Torvalds 已提交
795
			skb_trim(skb, nlh->nlmsg_len);
796
			e = nlmsg_data(nlh);
797 798
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));
799

800
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
E
Eric Dumazet 已提交
801
		} else {
L
Linus Torvalds 已提交
802
			kfree_skb(skb);
E
Eric Dumazet 已提交
803
		}
L
Linus Torvalds 已提交
804 805
	}

806
	ipmr_cache_free(c);
L
Linus Torvalds 已提交
807 808
}

809
/* Timer process for the unresolved queue. */
810
static void ipmr_expire_process(struct timer_list *t)
L
Linus Torvalds 已提交
811
{
812
	struct mr_table *mrt = from_timer(mrt, t, ipmr_expire_timer);
813
	struct mr_mfc *c, *next;
L
Linus Torvalds 已提交
814
	unsigned long expires;
815
	unsigned long now;
L
Linus Torvalds 已提交
816 817

	if (!spin_trylock(&mfc_unres_lock)) {
818
		mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
L
Linus Torvalds 已提交
819 820 821
		return;
	}

822
	if (list_empty(&mrt->mfc_unres_queue))
L
Linus Torvalds 已提交
823 824 825 826 827
		goto out;

	now = jiffies;
	expires = 10*HZ;

828
	list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
L
Linus Torvalds 已提交
829 830 831 832 833 834 835
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			continue;
		}

836
		list_del(&c->list);
837 838
		mroute_netlink_event(mrt, (struct mfc_cache *)c, RTM_DELROUTE);
		ipmr_destroy_unres(mrt, (struct mfc_cache *)c);
L
Linus Torvalds 已提交
839 840
	}

841 842
	if (!list_empty(&mrt->mfc_unres_queue))
		mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);
L
Linus Torvalds 已提交
843 844 845 846 847 848

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */
849
static void ipmr_update_thresholds(struct mr_table *mrt, struct mr_mfc *cache,
850
				   unsigned char *ttls)
L
Linus Torvalds 已提交
851 852 853 854 855 856 857
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

858 859
	for (vifi = 0; vifi < mrt->maxvif; vifi++) {
		if (VIF_EXISTS(mrt, vifi) &&
860
		    ttls[vifi] && ttls[vifi] < 255) {
L
Linus Torvalds 已提交
861 862 863 864 865 866 867
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
868
	cache->mfc_un.res.lastuse = jiffies;
L
Linus Torvalds 已提交
869 870
}

871 872
static int vif_add(struct net *net, struct mr_table *mrt,
		   struct vifctl *vifc, int mrtsock)
L
Linus Torvalds 已提交
873 874
{
	int vifi = vifc->vifc_vifi;
875 876 877
	struct switchdev_attr attr = {
		.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
	};
878
	struct vif_device *v = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
879 880
	struct net_device *dev;
	struct in_device *in_dev;
881
	int err;
L
Linus Torvalds 已提交
882 883

	/* Is vif busy ? */
884
	if (VIF_EXISTS(mrt, vifi))
L
Linus Torvalds 已提交
885 886 887 888
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
	case VIFF_REGISTER:
889
		if (!ipmr_pimsm_enabled())
890 891
			return -EINVAL;
		/* Special Purpose VIF in PIM
L
Linus Torvalds 已提交
892 893
		 * All the packets will be sent to the daemon
		 */
894
		if (mrt->mroute_reg_vif_num >= 0)
L
Linus Torvalds 已提交
895
			return -EADDRINUSE;
896
		dev = ipmr_reg_vif(net, mrt);
L
Linus Torvalds 已提交
897 898
		if (!dev)
			return -ENOBUFS;
899 900 901
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
902
			dev_put(dev);
903 904
			return err;
		}
L
Linus Torvalds 已提交
905
		break;
906
	case VIFF_TUNNEL:
907
		dev = ipmr_new_tunnel(net, vifc);
L
Linus Torvalds 已提交
908 909
		if (!dev)
			return -ENOBUFS;
910 911 912
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
913
			dev_put(dev);
914 915
			return err;
		}
L
Linus Torvalds 已提交
916
		break;
917
	case VIFF_USE_IFINDEX:
L
Linus Torvalds 已提交
918
	case 0:
919 920
		if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
			dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
921
			if (dev && !__in_dev_get_rtnl(dev)) {
922 923 924
				dev_put(dev);
				return -EADDRNOTAVAIL;
			}
E
Eric Dumazet 已提交
925
		} else {
926
			dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);
E
Eric Dumazet 已提交
927
		}
L
Linus Torvalds 已提交
928 929
		if (!dev)
			return -EADDRNOTAVAIL;
930
		err = dev_set_allmulti(dev, 1);
931 932
		if (err) {
			dev_put(dev);
933
			return err;
934
		}
L
Linus Torvalds 已提交
935 936 937 938 939
		break;
	default:
		return -EINVAL;
	}

E
Eric Dumazet 已提交
940 941
	in_dev = __in_dev_get_rtnl(dev);
	if (!in_dev) {
942
		dev_put(dev);
L
Linus Torvalds 已提交
943
		return -EADDRNOTAVAIL;
944
	}
945
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
946 947
	inet_netconf_notify_devconf(net, RTM_NEWNETCONF, NETCONFA_MC_FORWARDING,
				    dev->ifindex, &in_dev->cnf);
L
Linus Torvalds 已提交
948 949
	ip_rt_multicast_event(in_dev);

E
Eric Dumazet 已提交
950
	/* Fill in the VIF structures */
951 952 953 954
	vif_device_init(v, dev, vifc->vifc_rate_limit,
			vifc->vifc_threshold,
			vifc->vifc_flags | (!mrtsock ? VIFF_STATIC : 0),
			(VIFF_TUNNEL | VIFF_REGISTER));
E
Eric Dumazet 已提交
955

956 957 958 959 960 961 962
	attr.orig_dev = dev;
	if (!switchdev_port_attr_get(dev, &attr)) {
		memcpy(v->dev_parent_id.id, attr.u.ppid.id, attr.u.ppid.id_len);
		v->dev_parent_id.id_len = attr.u.ppid.id_len;
	} else {
		v->dev_parent_id.id_len = 0;
	}
963

J
Jianjun Kong 已提交
964 965
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
L
Linus Torvalds 已提交
966 967 968

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
J
Jianjun Kong 已提交
969
	v->dev = dev;
E
Eric Dumazet 已提交
970
	if (v->flags & VIFF_REGISTER)
971 972 973
		mrt->mroute_reg_vif_num = vifi;
	if (vifi+1 > mrt->maxvif)
		mrt->maxvif = vifi+1;
L
Linus Torvalds 已提交
974
	write_unlock_bh(&mrt_lock);
975
	call_ipmr_vif_entry_notifiers(net, FIB_EVENT_VIF_ADD, v, vifi, mrt->id);
L
Linus Torvalds 已提交
976 977 978
	return 0;
}

979
/* called with rcu_read_lock() */
980
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
981 982
					 __be32 origin,
					 __be32 mcastgrp)
L
Linus Torvalds 已提交
983
{
984 985 986 987
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = origin
	};
988

989
	return mr_mfc_find(mrt, &arg);
990 991 992 993 994 995
}

/* Look for a (*,G) entry.
 *
 * When @mcastgrp itself is INADDR_ANY the lookup is delegated to
 * mr_mfc_find_any_parent() for @vifi; otherwise mr_mfc_find_any() is
 * called with the key (INADDR_ANY, @mcastgrp).
 */
static struct mfc_cache *ipmr_cache_find_any(struct mr_table *mrt,
					     __be32 mcastgrp, int vifi)
{
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = htonl(INADDR_ANY)
	};

	if (mcastgrp == htonl(INADDR_ANY))
		return mr_mfc_find_any_parent(mrt, vifi);
	return mr_mfc_find_any(mrt, vifi, &arg);
}

1006 1007 1008 1009 1010 1011 1012 1013 1014 1015
/* Look for a (S,G,iif) entry if parent != -1.
 *
 * Builds the (@origin, @mcastgrp) key and returns the result of
 * mr_mfc_find_parent() for it and @parent.
 */
static struct mfc_cache *ipmr_cache_find_parent(struct mr_table *mrt,
						__be32 origin, __be32 mcastgrp,
						int parent)
{
	struct mfc_cache_cmp_arg arg = {
			.mfc_mcastgrp = mcastgrp,
			.mfc_origin = origin,
	};

	return mr_mfc_find_parent(mrt, &arg, parent);
}

1019
/* Allocate a multicast cache entry */
1020
static struct mfc_cache *ipmr_cache_alloc(void)
L
Linus Torvalds 已提交
1021
{
J
Jianjun Kong 已提交
1022
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
1023

1024
	if (c) {
1025 1026 1027
		c->_c.mfc_un.res.last_assert = jiffies - MFC_ASSERT_THRESH - 1;
		c->_c.mfc_un.res.minvif = MAXVIFS;
		refcount_set(&c->_c.mfc_un.res.refcount, 1);
1028
	}
L
Linus Torvalds 已提交
1029 1030 1031
	return c;
}

1032
static struct mfc_cache *ipmr_cache_alloc_unres(void)
L
Linus Torvalds 已提交
1033
{
J
Jianjun Kong 已提交
1034
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
1035 1036

	if (c) {
1037 1038
		skb_queue_head_init(&c->_c.mfc_un.unres.unresolved);
		c->_c.mfc_un.unres.expires = jiffies + 10 * HZ;
1039
	}
L
Linus Torvalds 已提交
1040 1041 1042
	return c;
}

1043
/* A cache entry has gone into a resolved state from queued */
1044 1045
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
			       struct mfc_cache *uc, struct mfc_cache *c)
L
Linus Torvalds 已提交
1046 1047
{
	struct sk_buff *skb;
1048
	struct nlmsgerr *e;
L
Linus Torvalds 已提交
1049

E
Eric Dumazet 已提交
1050
	/* Play the pending entries through our router */
1051
	while ((skb = __skb_dequeue(&uc->_c.mfc_un.unres.unresolved))) {
1052
		if (ip_hdr(skb)->version == 0) {
1053 1054
			struct nlmsghdr *nlh = skb_pull(skb,
							sizeof(struct iphdr));
L
Linus Torvalds 已提交
1055

Y
Yuval Mintz 已提交
1056 1057
			if (mr_fill_mroute(mrt, skb, &c->_c,
					   nlmsg_data(nlh)) > 0) {
E
Eric Dumazet 已提交
1058 1059
				nlh->nlmsg_len = skb_tail_pointer(skb) -
						 (u8 *)nlh;
L
Linus Torvalds 已提交
1060 1061
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
1062
				nlh->nlmsg_len = nlmsg_msg_size(sizeof(struct nlmsgerr));
L
Linus Torvalds 已提交
1063
				skb_trim(skb, nlh->nlmsg_len);
1064
				e = nlmsg_data(nlh);
1065 1066
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
L
Linus Torvalds 已提交
1067
			}
1068

1069
			rtnl_unicast(skb, net, NETLINK_CB(skb).portid);
E
Eric Dumazet 已提交
1070
		} else {
1071
			ip_mr_forward(net, mrt, skb->dev, skb, c, 0);
E
Eric Dumazet 已提交
1072
		}
L
Linus Torvalds 已提交
1073 1074 1075
	}
}

1076
/* Bounce a cache query up to mrouted and netlink.
L
Linus Torvalds 已提交
1077
 *
1078
 * Called under mrt_lock.
L
Linus Torvalds 已提交
1079
 */
1080
static int ipmr_cache_report(struct mr_table *mrt,
1081
			     struct sk_buff *pkt, vifi_t vifi, int assert)
L
Linus Torvalds 已提交
1082
{
1083
	const int ihl = ip_hdrlen(pkt);
1084
	struct sock *mroute_sk;
L
Linus Torvalds 已提交
1085 1086
	struct igmphdr *igmp;
	struct igmpmsg *msg;
1087
	struct sk_buff *skb;
L
Linus Torvalds 已提交
1088 1089 1090 1091 1092 1093 1094
	int ret;

	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
		skb = alloc_skb(128, GFP_ATOMIC);

S
Stephen Hemminger 已提交
1095
	if (!skb)
L
Linus Torvalds 已提交
1096 1097 1098 1099
		return -ENOBUFS;

	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
E
Eric Dumazet 已提交
1100 1101 1102
		 * Duplicate old header, fix ihl, length etc.
		 * And all this only to mangle msg->im_msgtype and
		 * to set msg->im_mbz to "mbz" :-)
L
Linus Torvalds 已提交
1103
		 */
1104 1105
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
1106
		skb_reset_transport_header(skb);
1107
		msg = (struct igmpmsg *)skb_network_header(skb);
1108
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
L
Linus Torvalds 已提交
1109 1110
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
1111
		msg->im_vif = mrt->mroute_reg_vif_num;
1112 1113 1114
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125
	} else {
		/* Copy the IP header */
		skb_set_network_header(skb, skb->len);
		skb_put(skb, ihl);
		skb_copy_to_linear_data(skb, pkt->data, ihl);
		/* Flag to the kernel this is a route add */
		ip_hdr(skb)->protocol = 0;
		msg = (struct igmpmsg *)skb_network_header(skb);
		msg->im_vif = vifi;
		skb_dst_set(skb, dst_clone(skb_dst(pkt)));
		/* Add our header */
1126
		igmp = skb_put(skb, sizeof(struct igmphdr));
1127 1128 1129 1130 1131
		igmp->type = assert;
		msg->im_msgtype = assert;
		igmp->code = 0;
		ip_hdr(skb)->tot_len = htons(skb->len);	/* Fix the length */
		skb->transport_header = skb->network_header;
1132
	}
L
Linus Torvalds 已提交
1133

E
Eric Dumazet 已提交
1134 1135
	rcu_read_lock();
	mroute_sk = rcu_dereference(mrt->mroute_sk);
1136
	if (!mroute_sk) {
E
Eric Dumazet 已提交
1137
		rcu_read_unlock();
L
Linus Torvalds 已提交
1138 1139 1140 1141
		kfree_skb(skb);
		return -EINVAL;
	}

1142 1143
	igmpmsg_netlink_event(mrt, skb);

E
Eric Dumazet 已提交
1144
	/* Deliver to mrouted */
E
Eric Dumazet 已提交
1145 1146
	ret = sock_queue_rcv_skb(mroute_sk, skb);
	rcu_read_unlock();
1147
	if (ret < 0) {
1148
		net_warn_ratelimited("mroute: pending queue full, dropping entries\n");
L
Linus Torvalds 已提交
1149 1150 1151 1152 1153 1154
		kfree_skb(skb);
	}

	return ret;
}

1155 1156
/* Queue a packet for resolution.
 *
 * Finds (or creates) the unresolved entry matching the source and
 * destination of @skb on @mrt, all under mfc_unres_lock. A newly created
 * entry is reported to mrouted with IGMPMSG_NOCACHE and, if it is the
 * first entry on the queue, arms the expiry timer.
 *
 * @skb is always consumed: it is appended to the entry's pending queue
 * on success and freed on every failure path.
 *
 * Returns 0 on success, -ENOBUFS when a limit is hit or allocation
 * fails, or the error from ipmr_cache_report().
 */
static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
				 struct sk_buff *skb, struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct mfc_cache *c = NULL;
	struct mfc_cache *pos;
	int err;

	spin_lock_bh(&mfc_unres_lock);

	list_for_each_entry(pos, &mrt->mfc_unres_queue, _c.list) {
		if (pos->mfc_mcastgrp == iph->daddr &&
		    pos->mfc_origin == iph->saddr) {
			c = pos;
			break;
		}
	}

	if (!c) {
		/* Create a new entry if allowable */
		if (atomic_read(&mrt->cache_resolve_queue_len) < 10)
			c = ipmr_cache_alloc_unres();
		if (!c) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/* Fill in the new cache entry */
		c->_c.mfc_parent = -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/* Reflect first query at mrouted. */
		err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
		if (err < 0) {
			/* If the report failed throw the cache entry
			 * out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&mrt->cache_resolve_queue_len);
		list_add(&c->_c.list, &mrt->mfc_unres_queue);
		mroute_netlink_event(mrt, c, RTM_NEWROUTE);

		/* First pending entry: start the expiry timer */
		if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
			mod_timer(&mrt->ipmr_expire_timer,
				  c->_c.mfc_un.unres.expires);
	}

	/* See if we can append the packet */
	if (c->_c.mfc_un.unres.unresolved.qlen > 3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		if (dev) {
			skb->dev = dev;
			skb->skb_iif = dev->ifindex;
		}
		skb_queue_tail(&c->_c.mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

1228
/* MFC cache manipulation by user space mroute daemon */
L
Linus Torvalds 已提交
1229

1230
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
L
Linus Torvalds 已提交
1231
{
1232
	struct net *net = read_pnet(&mrt->net);
1233
	struct mfc_cache *c;
L
Linus Torvalds 已提交
1234

1235 1236 1237 1238 1239 1240 1241
	/* The entries are added/deleted only under RTNL */
	rcu_read_lock();
	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
				   mfc->mfcc_mcastgrp.s_addr, parent);
	rcu_read_unlock();
	if (!c)
		return -ENOENT;
1242 1243
	rhltable_remove(&mrt->mfc_hash, &c->_c.mnode, ipmr_rht_params);
	list_del_rcu(&c->_c.list);
1244
	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, c, mrt->id);
1245
	mroute_netlink_event(mrt, c, RTM_DELROUTE);
1246
	ipmr_cache_put(c);
L
Linus Torvalds 已提交
1247

1248
	return 0;
L
Linus Torvalds 已提交
1249 1250
}

1251
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
1252
			struct mfcctl *mfc, int mrtsock, int parent)
L
Linus Torvalds 已提交
1253
{
1254
	struct mfc_cache *uc, *c;
1255
	struct mr_mfc *_uc;
1256 1257
	bool found;
	int ret;
L
Linus Torvalds 已提交
1258

1259 1260 1261
	if (mfc->mfcc_parent >= MAXVIFS)
		return -ENFILE;

1262 1263 1264 1265 1266 1267
	/* The entries are added/deleted only under RTNL */
	rcu_read_lock();
	c = ipmr_cache_find_parent(mrt, mfc->mfcc_origin.s_addr,
				   mfc->mfcc_mcastgrp.s_addr, parent);
	rcu_read_unlock();
	if (c) {
L
Linus Torvalds 已提交
1268
		write_lock_bh(&mrt_lock);
1269 1270
		c->_c.mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
L
Linus Torvalds 已提交
1271
		if (!mrtsock)
1272
			c->_c.mfc_flags |= MFC_STATIC;
L
Linus Torvalds 已提交
1273
		write_unlock_bh(&mrt_lock);
1274 1275
		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_REPLACE, c,
					      mrt->id);
1276
		mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1277 1278 1279
		return 0;
	}

1280
	if (mfc->mfcc_mcastgrp.s_addr != htonl(INADDR_ANY) &&
1281
	    !ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
L
Linus Torvalds 已提交
1282 1283
		return -EINVAL;

1284
	c = ipmr_cache_alloc();
1285
	if (!c)
L
Linus Torvalds 已提交
1286 1287
		return -ENOMEM;

J
Jianjun Kong 已提交
1288 1289
	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
1290 1291
	c->_c.mfc_parent = mfc->mfcc_parent;
	ipmr_update_thresholds(mrt, &c->_c, mfc->mfcc_ttls);
L
Linus Torvalds 已提交
1292
	if (!mrtsock)
1293
		c->_c.mfc_flags |= MFC_STATIC;
L
Linus Torvalds 已提交
1294

1295
	ret = rhltable_insert_key(&mrt->mfc_hash, &c->cmparg, &c->_c.mnode,
1296 1297 1298 1299 1300 1301
				  ipmr_rht_params);
	if (ret) {
		pr_err("ipmr: rhtable insert error %d\n", ret);
		ipmr_cache_free(c);
		return ret;
	}
1302
	list_add_tail_rcu(&c->_c.list, &mrt->mfc_cache_list);
1303 1304
	/* Check to see if we resolved a queued list. If so we
	 * need to send on the frames and tidy up.
L
Linus Torvalds 已提交
1305
	 */
1306
	found = false;
L
Linus Torvalds 已提交
1307
	spin_lock_bh(&mfc_unres_lock);
1308 1309
	list_for_each_entry(_uc, &mrt->mfc_unres_queue, list) {
		uc = (struct mfc_cache *)_uc;
1310
		if (uc->mfc_origin == c->mfc_origin &&
L
Linus Torvalds 已提交
1311
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
1312
			list_del(&_uc->list);
1313
			atomic_dec(&mrt->cache_resolve_queue_len);
1314
			found = true;
L
Linus Torvalds 已提交
1315 1316 1317
			break;
		}
	}
1318 1319
	if (list_empty(&mrt->mfc_unres_queue))
		del_timer(&mrt->ipmr_expire_timer);
L
Linus Torvalds 已提交
1320 1321
	spin_unlock_bh(&mfc_unres_lock);

1322
	if (found) {
1323
		ipmr_cache_resolve(net, mrt, uc, c);
1324
		ipmr_cache_free(uc);
L
Linus Torvalds 已提交
1325
	}
1326
	call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_ADD, c, mrt->id);
1327
	mroute_netlink_event(mrt, c, RTM_NEWROUTE);
L
Linus Torvalds 已提交
1328 1329 1330
	return 0;
}

1331
/* Close the multicast socket, and clear the vif tables etc */
1332
static void mroute_clean_tables(struct mr_table *mrt, bool all)
L
Linus Torvalds 已提交
1333
{
1334
	struct net *net = read_pnet(&mrt->net);
1335 1336
	struct mr_mfc *c, *tmp;
	struct mfc_cache *cache;
1337
	LIST_HEAD(list);
1338
	int i;
1339

E
Eric Dumazet 已提交
1340
	/* Shut down all active vif entries */
1341
	for (i = 0; i < mrt->maxvif; i++) {
1342 1343 1344
		if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
			continue;
		vif_delete(mrt, i, 0, &list);
L
Linus Torvalds 已提交
1345
	}
1346
	unregister_netdevice_many(&list);
L
Linus Torvalds 已提交
1347

E
Eric Dumazet 已提交
1348
	/* Wipe the cache */
1349 1350 1351 1352 1353
	list_for_each_entry_safe(c, tmp, &mrt->mfc_cache_list, list) {
		if (!all && (c->mfc_flags & MFC_STATIC))
			continue;
		rhltable_remove(&mrt->mfc_hash, &c->mnode, ipmr_rht_params);
		list_del_rcu(&c->list);
1354 1355
		cache = (struct mfc_cache *)c;
		call_ipmr_mfc_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, cache,
1356
					      mrt->id);
1357 1358
		mroute_netlink_event(mrt, cache, RTM_DELROUTE);
		ipmr_cache_put(cache);
L
Linus Torvalds 已提交
1359 1360
	}

1361
	if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
L
Linus Torvalds 已提交
1362
		spin_lock_bh(&mfc_unres_lock);
1363
		list_for_each_entry_safe(c, tmp, &mrt->mfc_unres_queue, list) {
1364
			list_del(&c->list);
1365 1366 1367
			cache = (struct mfc_cache *)c;
			mroute_netlink_event(mrt, cache, RTM_DELROUTE);
			ipmr_destroy_unres(mrt, cache);
L
Linus Torvalds 已提交
1368 1369 1370 1371 1372
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

E
Eric Dumazet 已提交
1373 1374 1375
/* called from ip_ra_control(), before an RCU grace period,
 * we dont need to call synchronize_rcu() here
 */
L
Linus Torvalds 已提交
1376 1377
static void mrtsock_destruct(struct sock *sk)
{
1378
	struct net *net = sock_net(sk);
1379
	struct mr_table *mrt;
1380

1381
	rtnl_lock();
1382
	ipmr_for_each_table(mrt, net) {
E
Eric Dumazet 已提交
1383
		if (sk == rtnl_dereference(mrt->mroute_sk)) {
1384
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;
1385 1386
			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
						    NETCONFA_MC_FORWARDING,
1387 1388
						    NETCONFA_IFINDEX_ALL,
						    net->ipv4.devconf_all);
1389
			RCU_INIT_POINTER(mrt->mroute_sk, NULL);
1390
			mroute_clean_tables(mrt, false);
1391
		}
L
Linus Torvalds 已提交
1392
	}
1393
	rtnl_unlock();
L
Linus Torvalds 已提交
1394 1395
}

1396 1397 1398 1399
/* Socket options and virtual interface manipulation. The whole
 * virtual interface system is a complete heap, but unfortunately
 * that's how BSD mrouted happens to think. Maybe one day with a proper
 * MOSPF/PIM router set up we can clean this up.
L
Linus Torvalds 已提交
1400
 */
1401

1402 1403
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
			 unsigned int optlen)
L
Linus Torvalds 已提交
1404
{
1405
	struct net *net = sock_net(sk);
1406
	int val, ret = 0, parent = 0;
1407
	struct mr_table *mrt;
1408 1409 1410
	struct vifctl vif;
	struct mfcctl mfc;
	u32 uval;
1411

1412 1413
	/* There's one exception to the lock - MRT_DONE which needs to unlock */
	rtnl_lock();
1414
	if (sk->sk_type != SOCK_RAW ||
1415 1416 1417 1418
	    inet_sk(sk)->inet_num != IPPROTO_IGMP) {
		ret = -EOPNOTSUPP;
		goto out_unlock;
	}
1419

1420
	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1421 1422 1423 1424
	if (!mrt) {
		ret = -ENOENT;
		goto out_unlock;
	}
S
Stephen Hemminger 已提交
1425
	if (optname != MRT_INIT) {
1426
		if (sk != rcu_access_pointer(mrt->mroute_sk) &&
1427 1428 1429 1430
		    !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
			ret = -EACCES;
			goto out_unlock;
		}
L
Linus Torvalds 已提交
1431 1432
	}

S
Stephen Hemminger 已提交
1433 1434
	switch (optname) {
	case MRT_INIT:
1435
		if (optlen != sizeof(int)) {
1436
			ret = -EINVAL;
1437 1438 1439
			break;
		}
		if (rtnl_dereference(mrt->mroute_sk)) {
1440 1441
			ret = -EADDRINUSE;
			break;
1442
		}
S
Stephen Hemminger 已提交
1443 1444 1445

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
1446
			rcu_assign_pointer(mrt->mroute_sk, sk);
1447
			IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
1448 1449
			inet_netconf_notify_devconf(net, RTM_NEWNETCONF,
						    NETCONFA_MC_FORWARDING,
1450 1451
						    NETCONFA_IFINDEX_ALL,
						    net->ipv4.devconf_all);
S
Stephen Hemminger 已提交
1452
		}
1453
		break;
S
Stephen Hemminger 已提交
1454
	case MRT_DONE:
1455 1456 1457
		if (sk != rcu_access_pointer(mrt->mroute_sk)) {
			ret = -EACCES;
		} else {
1458 1459 1460 1461 1462
			/* We need to unlock here because mrtsock_destruct takes
			 * care of rtnl itself and we can't change that due to
			 * the IP_ROUTER_ALERT setsockopt which runs without it.
			 */
			rtnl_unlock();
1463
			ret = ip_ra_control(sk, 0, NULL);
1464
			goto out;
1465 1466
		}
		break;
S
Stephen Hemminger 已提交
1467 1468
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480
		if (optlen != sizeof(vif)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&vif, optval, sizeof(vif))) {
			ret = -EFAULT;
			break;
		}
		if (vif.vifc_vifi >= MAXVIFS) {
			ret = -ENFILE;
			break;
		}
J
Jianjun Kong 已提交
1481
		if (optname == MRT_ADD_VIF) {
E
Eric Dumazet 已提交
1482 1483
			ret = vif_add(net, mrt, &vif,
				      sk == rtnl_dereference(mrt->mroute_sk));
S
Stephen Hemminger 已提交
1484
		} else {
1485
			ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
S
Stephen Hemminger 已提交
1486
		}
1487
		break;
1488 1489 1490
	/* Manipulate the forwarding caches. These live
	 * in a sort of kernel/user symbiosis.
	 */
S
Stephen Hemminger 已提交
1491 1492
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
1493
		parent = -1;
1494
		/* fall through */
1495 1496
	case MRT_ADD_MFC_PROXY:
	case MRT_DEL_MFC_PROXY:
1497 1498 1499 1500 1501 1502 1503 1504
		if (optlen != sizeof(mfc)) {
			ret = -EINVAL;
			break;
		}
		if (copy_from_user(&mfc, optval, sizeof(mfc))) {
			ret = -EFAULT;
			break;
		}
1505 1506 1507 1508
		if (parent == 0)
			parent = mfc.mfcc_parent;
		if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
			ret = ipmr_mfc_delete(mrt, &mfc, parent);
S
Stephen Hemminger 已提交
1509
		else
E
Eric Dumazet 已提交
1510
			ret = ipmr_mfc_add(net, mrt, &mfc,
1511 1512
					   sk == rtnl_dereference(mrt->mroute_sk),
					   parent);
1513
		break;
1514
	/* Control PIM assert. */
S
Stephen Hemminger 已提交
1515
	case MRT_ASSERT:
1516 1517 1518 1519 1520 1521 1522 1523 1524 1525
		if (optlen != sizeof(val)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(val, (int __user *)optval)) {
			ret = -EFAULT;
			break;
		}
		mrt->mroute_do_assert = val;
		break;
S
Stephen Hemminger 已提交
1526
	case MRT_PIM:
1527
		if (!ipmr_pimsm_enabled()) {
1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538
			ret = -ENOPROTOOPT;
			break;
		}
		if (optlen != sizeof(val)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(val, (int __user *)optval)) {
			ret = -EFAULT;
			break;
		}
S
Stephen Hemminger 已提交
1539

1540 1541 1542 1543
		val = !!val;
		if (val != mrt->mroute_do_pim) {
			mrt->mroute_do_pim = val;
			mrt->mroute_do_assert = val;
L
Linus Torvalds 已提交
1544
		}
1545
		break;
1546
	case MRT_TABLE:
1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558
		if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
			ret = -ENOPROTOOPT;
			break;
		}
		if (optlen != sizeof(uval)) {
			ret = -EINVAL;
			break;
		}
		if (get_user(uval, (u32 __user *)optval)) {
			ret = -EFAULT;
			break;
		}
1559

E
Eric Dumazet 已提交
1560 1561 1562
		if (sk == rtnl_dereference(mrt->mroute_sk)) {
			ret = -EBUSY;
		} else {
1563
			mrt = ipmr_new_table(net, uval);
1564 1565
			if (IS_ERR(mrt))
				ret = PTR_ERR(mrt);
1566
			else
1567
				raw_sk(sk)->ipmr_table = uval;
E
Eric Dumazet 已提交
1568
		}
1569
		break;
1570
	/* Spurious command, or MRT_VERSION which you cannot set. */
S
Stephen Hemminger 已提交
1571
	default:
1572
		ret = -ENOPROTOOPT;
L
Linus Torvalds 已提交
1573
	}
1574 1575
out_unlock:
	rtnl_unlock();
1576
out:
1577
	return ret;
L
Linus Torvalds 已提交
1578 1579
}

1580
/* Getsock opt support for the multicast routing system. */
J
Jianjun Kong 已提交
1581
int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
L
Linus Torvalds 已提交
1582 1583 1584
{
	int olr;
	int val;
1585
	struct net *net = sock_net(sk);
1586 1587
	struct mr_table *mrt;

1588 1589 1590 1591
	if (sk->sk_type != SOCK_RAW ||
	    inet_sk(sk)->inet_num != IPPROTO_IGMP)
		return -EOPNOTSUPP;

1592
	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1593
	if (!mrt)
1594
		return -ENOENT;
L
Linus Torvalds 已提交
1595

1596 1597 1598 1599 1600
	switch (optname) {
	case MRT_VERSION:
		val = 0x0305;
		break;
	case MRT_PIM:
1601
		if (!ipmr_pimsm_enabled())
1602 1603 1604 1605 1606 1607 1608
			return -ENOPROTOOPT;
		val = mrt->mroute_do_pim;
		break;
	case MRT_ASSERT:
		val = mrt->mroute_do_assert;
		break;
	default:
L
Linus Torvalds 已提交
1609
		return -ENOPROTOOPT;
1610
	}
L
Linus Torvalds 已提交
1611 1612 1613 1614 1615 1616

	if (get_user(olr, optlen))
		return -EFAULT;
	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;
J
Jianjun Kong 已提交
1617
	if (put_user(olr, optlen))
L
Linus Torvalds 已提交
1618
		return -EFAULT;
J
Jianjun Kong 已提交
1619
	if (copy_to_user(optval, &val, olr))
L
Linus Torvalds 已提交
1620 1621 1622 1623
		return -EFAULT;
	return 0;
}

1624
/* The IP multicast ioctl support routines. */
L
Linus Torvalds 已提交
1625 1626 1627 1628 1629 1630
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;
1631
	struct net *net = sock_net(sk);
1632 1633 1634
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
1635
	if (!mrt)
1636
		return -ENOENT;
1637

S
Stephen Hemminger 已提交
1638 1639
	switch (cmd) {
	case SIOCGETVIFCNT:
J
Jianjun Kong 已提交
1640
		if (copy_from_user(&vr, arg, sizeof(vr)))
S
Stephen Hemminger 已提交
1641
			return -EFAULT;
1642
		if (vr.vifi >= mrt->maxvif)
S
Stephen Hemminger 已提交
1643 1644
			return -EINVAL;
		read_lock(&mrt_lock);
1645 1646
		vif = &mrt->vif_table[vr.vifi];
		if (VIF_EXISTS(mrt, vr.vifi)) {
J
Jianjun Kong 已提交
1647 1648 1649 1650
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
L
Linus Torvalds 已提交
1651 1652
			read_unlock(&mrt_lock);

J
Jianjun Kong 已提交
1653
			if (copy_to_user(arg, &vr, sizeof(vr)))
S
Stephen Hemminger 已提交
1654 1655 1656 1657 1658 1659
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
J
Jianjun Kong 已提交
1660
		if (copy_from_user(&sr, arg, sizeof(sr)))
S
Stephen Hemminger 已提交
1661 1662
			return -EFAULT;

1663
		rcu_read_lock();
1664
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
S
Stephen Hemminger 已提交
1665
		if (c) {
1666 1667 1668
			sr.pktcnt = c->_c.mfc_un.res.pkt;
			sr.bytecnt = c->_c.mfc_un.res.bytes;
			sr.wrong_if = c->_c.mfc_un.res.wrong_if;
1669
			rcu_read_unlock();
S
Stephen Hemminger 已提交
1670

J
Jianjun Kong 已提交
1671
			if (copy_to_user(arg, &sr, sizeof(sr)))
S
Stephen Hemminger 已提交
1672 1673 1674
				return -EFAULT;
			return 0;
		}
1675
		rcu_read_unlock();
S
Stephen Hemminger 已提交
1676 1677 1678
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
L
Linus Torvalds 已提交
1679 1680 1681
	}
}

1682 1683 1684 1685 1686 1687 1688 1689 1690
#ifdef CONFIG_COMPAT
/* Compat layout of the SIOCGETSGCNT request: the counters use
 * compat_ulong_t. Used by ipmr_compat_ioctl().
 */
struct compat_sioc_sg_req {
	struct in_addr src;
	struct in_addr grp;
	compat_ulong_t pktcnt;
	compat_ulong_t bytecnt;
	compat_ulong_t wrong_if;
};

1691 1692 1693 1694 1695 1696 1697 1698
/* Compat layout of the SIOCGETVIFCNT request: the counters use
 * compat_ulong_t. Used by ipmr_compat_ioctl().
 */
struct compat_sioc_vif_req {
	vifi_t	vifi;		/* Which iface */
	compat_ulong_t icount;
	compat_ulong_t ocount;
	compat_ulong_t ibytes;
	compat_ulong_t obytes;
};

1699 1700
/* Compat counterpart of ipmr_ioctl().
 *
 * Handles the same two commands (SIOCGETVIFCNT, SIOCGETSGCNT) with the
 * same error codes, but exchanges the compat_sioc_* request layouts
 * with user space.
 */
int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
{
	struct net *net = sock_net(sk);
	struct compat_sioc_vif_req vr;
	struct compat_sioc_sg_req sr;
	struct vif_device *vif;
	struct mr_table *mrt;
	struct mfc_cache *c;

	mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= mrt->maxvif)
			return -EINVAL;

		read_lock(&mrt_lock);
		if (!VIF_EXISTS(mrt, vr.vifi)) {
			read_unlock(&mrt_lock);
			return -EADDRNOTAVAIL;
		}
		/* Snapshot the counters while holding the lock */
		vif = &mrt->vif_table[vr.vifi];
		vr.icount = vif->pkt_in;
		vr.ocount = vif->pkt_out;
		vr.ibytes = vif->bytes_in;
		vr.obytes = vif->bytes_out;
		read_unlock(&mrt_lock);

		if (copy_to_user(arg, &vr, sizeof(vr)))
			return -EFAULT;
		return 0;

	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		rcu_read_lock();
		c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
		if (!c) {
			rcu_read_unlock();
			return -EADDRNOTAVAIL;
		}
		/* Snapshot the counters inside the RCU read section */
		sr.pktcnt = c->_c.mfc_un.res.pkt;
		sr.bytecnt = c->_c.mfc_un.res.bytes;
		sr.wrong_if = c->_c.mfc_un.res.wrong_if;
		rcu_read_unlock();

		if (copy_to_user(arg, &sr, sizeof(sr)))
			return -EFAULT;
		return 0;

	default:
		return -ENOIOCTLCMD;
	}
}
#endif

L
Linus Torvalds 已提交
1757 1758
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
1759
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
1760
	struct net *net = dev_net(dev);
1761
	struct mr_table *mrt;
L
Linus Torvalds 已提交
1762 1763
	struct vif_device *v;
	int ct;
1764

L
Linus Torvalds 已提交
1765 1766
	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
1767 1768 1769 1770 1771

	ipmr_for_each_table(mrt, net) {
		v = &mrt->vif_table[0];
		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
			if (v->dev == dev)
1772
				vif_delete(mrt, ct, 1, NULL);
1773
		}
L
Linus Torvalds 已提交
1774 1775 1776 1777
	}
	return NOTIFY_DONE;
}

J
Jianjun Kong 已提交
1778
static struct notifier_block ip_mr_notifier = {
L
Linus Torvalds 已提交
1779 1780 1781
	.notifier_call = ipmr_device_event,
};

1782 1783 1784
/* Encapsulate a packet by attaching a valid IPIP header to it.
 * This avoids tunnel drivers and other mess and gives us the speed so
 * important for multicast video.
L
Linus Torvalds 已提交
1785
 */
1786 1787
static void ip_encap(struct net *net, struct sk_buff *skb,
		     __be32 saddr, __be32 daddr)
L
Linus Torvalds 已提交
1788
{
1789
	struct iphdr *iph;
1790
	const struct iphdr *old_iph = ip_hdr(skb);
1791 1792

	skb_push(skb, sizeof(struct iphdr));
1793
	skb->transport_header = skb->network_header;
1794
	skb_reset_network_header(skb);
1795
	iph = ip_hdr(skb);
L
Linus Torvalds 已提交
1796

E
Eric Dumazet 已提交
1797
	iph->version	=	4;
1798 1799
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
L
Linus Torvalds 已提交
1800 1801 1802 1803 1804 1805
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
1806
	ip_select_ident(net, skb, NULL);
L
Linus Torvalds 已提交
1807 1808 1809 1810 1811 1812
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}

1813 1814
static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
				      struct sk_buff *skb)
L
Linus Torvalds 已提交
1815
{
E
Eric Dumazet 已提交
1816
	struct ip_options *opt = &(IPCB(skb)->opt);
L
Linus Torvalds 已提交
1817

1818 1819
	IP_INC_STATS(net, IPSTATS_MIB_OUTFORWDATAGRAMS);
	IP_ADD_STATS(net, IPSTATS_MIB_OUTOCTETS, skb->len);
L
Linus Torvalds 已提交
1820 1821 1822 1823

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

1824
	return dst_output(net, sk, skb);
L
Linus Torvalds 已提交
1825 1826
}

1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848
#ifdef CONFIG_NET_SWITCHDEV
/* Report whether forwarding @skb from @in_vifi to @out_vifi is treated
 * as already offloaded: the skb must carry the offload_mr_fwd_mark and
 * both vifs must have a non-empty parent id that compares equal.
 */
static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
				   int in_vifi, int out_vifi)
{
	struct vif_device *out = &mrt->vif_table[out_vifi];
	struct vif_device *in = &mrt->vif_table[in_vifi];

	return skb->offload_mr_fwd_mark &&
	       out->dev_parent_id.id_len &&
	       in->dev_parent_id.id_len &&
	       netdev_phys_item_id_same(&out->dev_parent_id,
					&in->dev_parent_id);
}
#else
/* Without switchdev support nothing is ever offloaded. */
static bool ipmr_forward_offloaded(struct sk_buff *skb, struct mr_table *mrt,
				   int in_vifi, int out_vifi)
{
	return false;
}
#endif

1849
/* Processing handlers for ipmr_forward */
L
Linus Torvalds 已提交
1850

1851
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
1852 1853
			    int in_vifi, struct sk_buff *skb,
			    struct mfc_cache *c, int vifi)
L
Linus Torvalds 已提交
1854
{
1855
	const struct iphdr *iph = ip_hdr(skb);
1856
	struct vif_device *vif = &mrt->vif_table[vifi];
L
Linus Torvalds 已提交
1857 1858
	struct net_device *dev;
	struct rtable *rt;
1859
	struct flowi4 fl4;
L
Linus Torvalds 已提交
1860 1861
	int    encap = 0;

1862
	if (!vif->dev)
L
Linus Torvalds 已提交
1863 1864 1865 1866
		goto out_free;

	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
J
Jianjun Kong 已提交
1867
		vif->bytes_out += skb->len;
1868 1869
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
1870
		ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
1871
		goto out_free;
L
Linus Torvalds 已提交
1872 1873
	}

1874 1875 1876
	if (ipmr_forward_offloaded(skb, mrt, in_vifi, vifi))
		goto out_free;

E
Eric Dumazet 已提交
1877
	if (vif->flags & VIFF_TUNNEL) {
1878
		rt = ip_route_output_ports(net, &fl4, NULL,
1879 1880 1881 1882
					   vif->remote, vif->local,
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos), vif->link);
1883
		if (IS_ERR(rt))
L
Linus Torvalds 已提交
1884 1885 1886
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
1887
		rt = ip_route_output_ports(net, &fl4, NULL, iph->daddr, 0,
1888 1889 1890
					   0, 0,
					   IPPROTO_IPIP,
					   RT_TOS(iph->tos), vif->link);
1891
		if (IS_ERR(rt))
L
Linus Torvalds 已提交
1892 1893 1894
			goto out_free;
	}

1895
	dev = rt->dst.dev;
L
Linus Torvalds 已提交
1896

1897
	if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) {
L
Linus Torvalds 已提交
1898
		/* Do not fragment multicasts. Alas, IPv4 does not
E
Eric Dumazet 已提交
1899 1900
		 * allow to send ICMP, so that packets will disappear
		 * to blackhole.
L
Linus Torvalds 已提交
1901
		 */
1902
		IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
L
Linus Torvalds 已提交
1903 1904 1905 1906
		ip_rt_put(rt);
		goto out_free;
	}

1907
	encap += LL_RESERVED_SPACE(dev) + rt->dst.header_len;
L
Linus Torvalds 已提交
1908 1909

	if (skb_cow(skb, encap)) {
1910
		ip_rt_put(rt);
L
Linus Torvalds 已提交
1911 1912 1913 1914
		goto out_free;
	}

	vif->pkt_out++;
J
Jianjun Kong 已提交
1915
	vif->bytes_out += skb->len;
L
Linus Torvalds 已提交
1916

E
Eric Dumazet 已提交
1917
	skb_dst_drop(skb);
1918
	skb_dst_set(skb, &rt->dst);
1919
	ip_decrease_ttl(ip_hdr(skb));
L
Linus Torvalds 已提交
1920 1921

	/* FIXME: forward and output firewalls used to be called here.
E
Eric Dumazet 已提交
1922 1923
	 * What do we do with netfilter? -- RR
	 */
L
Linus Torvalds 已提交
1924
	if (vif->flags & VIFF_TUNNEL) {
1925
		ip_encap(net, skb, vif->local, vif->remote);
L
Linus Torvalds 已提交
1926
		/* FIXME: extra output firewall step used to be here. --RR */
1927 1928
		vif->dev->stats.tx_packets++;
		vif->dev->stats.tx_bytes += skb->len;
L
Linus Torvalds 已提交
1929 1930
	}

1931
	IPCB(skb)->flags |= IPSKB_FORWARDED;
L
Linus Torvalds 已提交
1932

1933
	/* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
L
Linus Torvalds 已提交
1934 1935 1936 1937 1938 1939 1940 1941 1942
	 * not only before forwarding, but after forwarding on all output
	 * interfaces. It is clear, if mrouter runs a multicasting
	 * program, it should receive packets not depending to what interface
	 * program is joined.
	 * If we will not make it, the program will have to join on all
	 * interfaces. On the other hand, multihoming host (or router, but
	 * not mrouter) cannot join to more than one interface - it will
	 * result in receiving multiple packets.
	 */
1943 1944
	NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD,
		net, NULL, skb, skb->dev, dev,
L
Linus Torvalds 已提交
1945 1946 1947 1948 1949 1950 1951
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
}

1952
static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
L
Linus Torvalds 已提交
1953 1954
{
	int ct;
1955 1956 1957

	for (ct = mrt->maxvif-1; ct >= 0; ct--) {
		if (mrt->vif_table[ct].dev == dev)
L
Linus Torvalds 已提交
1958 1959 1960 1961 1962 1963
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */
1964
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
1965
			  struct net_device *dev, struct sk_buff *skb,
1966
			  struct mfc_cache *c, int local)
L
Linus Torvalds 已提交
1967
{
1968
	int true_vifi = ipmr_find_vif(mrt, dev);
L
Linus Torvalds 已提交
1969 1970 1971
	int psend = -1;
	int vif, ct;

1972 1973 1974 1975
	vif = c->_c.mfc_parent;
	c->_c.mfc_un.res.pkt++;
	c->_c.mfc_un.res.bytes += skb->len;
	c->_c.mfc_un.res.lastuse = jiffies;
L
Linus Torvalds 已提交
1976

1977
	if (c->mfc_origin == htonl(INADDR_ANY) && true_vifi >= 0) {
1978 1979 1980 1981 1982
		struct mfc_cache *cache_proxy;

		/* For an (*,G) entry, we only check that the incomming
		 * interface is part of the static tree.
		 */
1983
		cache_proxy = mr_mfc_find_any_parent(mrt, vif);
1984
		if (cache_proxy &&
1985
		    cache_proxy->_c.mfc_un.res.ttls[true_vifi] < 255)
1986 1987 1988
			goto forward;
	}

1989
	/* Wrong interface: drop packet and (maybe) send PIM assert. */
1990
	if (mrt->vif_table[vif].dev != dev) {
1991
		if (rt_is_output_route(skb_rtable(skb))) {
L
Linus Torvalds 已提交
1992
			/* It is our own packet, looped back.
E
Eric Dumazet 已提交
1993 1994 1995 1996 1997 1998 1999 2000 2001
			 * Very complicated situation...
			 *
			 * The best workaround until routing daemons will be
			 * fixed is not to redistribute packet, if it was
			 * send through wrong interface. It means, that
			 * multicast applications WILL NOT work for
			 * (S,G), which have default multicast route pointing
			 * to wrong oif. In any case, it is not a good
			 * idea to use multicasting applications on router.
L
Linus Torvalds 已提交
2002 2003 2004 2005
			 */
			goto dont_forward;
		}

2006
		c->_c.mfc_un.res.wrong_if++;
L
Linus Torvalds 已提交
2007

2008
		if (true_vifi >= 0 && mrt->mroute_do_assert &&
L
Linus Torvalds 已提交
2009
		    /* pimsm uses asserts, when switching from RPT to SPT,
E
Eric Dumazet 已提交
2010 2011 2012
		     * so that we cannot check that packet arrived on an oif.
		     * It is bad, but otherwise we would need to move pretty
		     * large chunk of pimd to kernel. Ough... --ANK
L
Linus Torvalds 已提交
2013
		     */
2014
		    (mrt->mroute_do_pim ||
2015
		     c->_c.mfc_un.res.ttls[true_vifi] < 255) &&
2016
		    time_after(jiffies,
2017 2018 2019
			       c->_c.mfc_un.res.last_assert +
			       MFC_ASSERT_THRESH)) {
			c->_c.mfc_un.res.last_assert = jiffies;
2020
			ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
L
Linus Torvalds 已提交
2021 2022 2023 2024
		}
		goto dont_forward;
	}

2025
forward:
2026 2027
	mrt->vif_table[vif].pkt_in++;
	mrt->vif_table[vif].bytes_in += skb->len;
L
Linus Torvalds 已提交
2028

2029
	/* Forward the frame */
2030 2031
	if (c->mfc_origin == htonl(INADDR_ANY) &&
	    c->mfc_mcastgrp == htonl(INADDR_ANY)) {
2032
		if (true_vifi >= 0 &&
2033
		    true_vifi != c->_c.mfc_parent &&
2034
		    ip_hdr(skb)->ttl >
2035
				c->_c.mfc_un.res.ttls[c->_c.mfc_parent]) {
2036 2037 2038 2039
			/* It's an (*,*) entry and the packet is not coming from
			 * the upstream: forward the packet to the upstream
			 * only.
			 */
2040
			psend = c->_c.mfc_parent;
2041 2042 2043 2044
			goto last_forward;
		}
		goto dont_forward;
	}
2045 2046
	for (ct = c->_c.mfc_un.res.maxvif - 1;
	     ct >= c->_c.mfc_un.res.minvif; ct--) {
2047
		/* For (*,G) entry, don't forward to the incoming interface */
2048
		if ((c->mfc_origin != htonl(INADDR_ANY) ||
2049
		     ct != true_vifi) &&
2050
		    ip_hdr(skb)->ttl > c->_c.mfc_un.res.ttls[ct]) {
L
Linus Torvalds 已提交
2051 2052
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
E
Eric Dumazet 已提交
2053

L
Linus Torvalds 已提交
2054
				if (skb2)
2055
					ipmr_queue_xmit(net, mrt, true_vifi,
2056
							skb2, c, psend);
L
Linus Torvalds 已提交
2057
			}
J
Jianjun Kong 已提交
2058
			psend = ct;
L
Linus Torvalds 已提交
2059 2060
		}
	}
2061
last_forward:
L
Linus Torvalds 已提交
2062 2063 2064
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
E
Eric Dumazet 已提交
2065

L
Linus Torvalds 已提交
2066
			if (skb2)
2067
				ipmr_queue_xmit(net, mrt, true_vifi, skb2,
2068
						c, psend);
L
Linus Torvalds 已提交
2069
		} else {
2070
			ipmr_queue_xmit(net, mrt, true_vifi, skb, c, psend);
2071
			return;
L
Linus Torvalds 已提交
2072 2073 2074 2075 2076 2077 2078 2079
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
}

2080
static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
2081
{
2082 2083
	struct rtable *rt = skb_rtable(skb);
	struct iphdr *iph = ip_hdr(skb);
D
David S. Miller 已提交
2084
	struct flowi4 fl4 = {
2085 2086
		.daddr = iph->daddr,
		.saddr = iph->saddr,
2087
		.flowi4_tos = RT_TOS(iph->tos),
D
David S. Miller 已提交
2088 2089 2090
		.flowi4_oif = (rt_is_output_route(rt) ?
			       skb->dev->ifindex : 0),
		.flowi4_iif = (rt_is_output_route(rt) ?
2091
			       LOOPBACK_IFINDEX :
D
David S. Miller 已提交
2092
			       skb->dev->ifindex),
2093
		.flowi4_mark = skb->mark,
2094 2095 2096 2097
	};
	struct mr_table *mrt;
	int err;

D
David S. Miller 已提交
2098
	err = ipmr_fib_lookup(net, &fl4, &mrt);
2099 2100 2101 2102
	if (err)
		return ERR_PTR(err);
	return mrt;
}
L
Linus Torvalds 已提交
2103

2104 2105
/* Multicast packets for forwarding arrive here
 * Called with rcu_read_lock();
L
Linus Torvalds 已提交
2106 2107 2108 2109
 */
int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
2110
	struct net *net = dev_net(skb->dev);
E
Eric Dumazet 已提交
2111
	int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
2112
	struct mr_table *mrt;
2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126
	struct net_device *dev;

	/* skb->dev passed in is the loX master dev for vrfs.
	 * As there are no vifs associated with loopback devices,
	 * get the proper interface that does have a vif associated with it.
	 */
	dev = skb->dev;
	if (netif_is_l3_master(skb->dev)) {
		dev = dev_get_by_index_rcu(net, IPCB(skb)->iif);
		if (!dev) {
			kfree_skb(skb);
			return -ENODEV;
		}
	}
L
Linus Torvalds 已提交
2127 2128

	/* Packet is looped back after forward, it should not be
E
Eric Dumazet 已提交
2129
	 * forwarded second time, but still can be delivered locally.
L
Linus Torvalds 已提交
2130
	 */
E
Eric Dumazet 已提交
2131
	if (IPCB(skb)->flags & IPSKB_FORWARDED)
L
Linus Torvalds 已提交
2132 2133
		goto dont_forward;

2134
	mrt = ipmr_rt_fib_lookup(net, skb);
2135 2136 2137
	if (IS_ERR(mrt)) {
		kfree_skb(skb);
		return PTR_ERR(mrt);
2138
	}
L
Linus Torvalds 已提交
2139
	if (!local) {
E
Eric Dumazet 已提交
2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157
		if (IPCB(skb)->opt.router_alert) {
			if (ip_call_ra_chain(skb))
				return 0;
		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
			/* IGMPv1 (and broken IGMPv2 implementations sort of
			 * Cisco IOS <= 11.2(8)) do not put router alert
			 * option to IGMP packets destined to routable
			 * groups. It is very bad, because it means
			 * that we can forward NO IGMP messages.
			 */
			struct sock *mroute_sk;

			mroute_sk = rcu_dereference(mrt->mroute_sk);
			if (mroute_sk) {
				nf_reset(skb);
				raw_rcv(mroute_sk, skb);
				return 0;
			}
L
Linus Torvalds 已提交
2158 2159 2160
		    }
	}

2161
	/* already under rcu_read_lock() */
2162
	cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
2163
	if (!cache) {
2164
		int vif = ipmr_find_vif(mrt, dev);
2165 2166 2167 2168 2169

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, ip_hdr(skb)->daddr,
						    vif);
	}
L
Linus Torvalds 已提交
2170

2171
	/* No usable cache entry */
2172
	if (!cache) {
L
Linus Torvalds 已提交
2173 2174 2175 2176 2177
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
2178
			if (!skb2)
L
Linus Torvalds 已提交
2179 2180 2181 2182
				return -ENOBUFS;
			skb = skb2;
		}

2183
		read_lock(&mrt_lock);
2184
		vif = ipmr_find_vif(mrt, dev);
L
Linus Torvalds 已提交
2185
		if (vif >= 0) {
2186
			int err2 = ipmr_cache_unresolved(mrt, vif, skb, dev);
L
Linus Torvalds 已提交
2187 2188
			read_unlock(&mrt_lock);

2189
			return err2;
L
Linus Torvalds 已提交
2190 2191 2192 2193 2194 2195
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

2196
	read_lock(&mrt_lock);
2197
	ip_mr_forward(net, mrt, dev, skb, cache, local);
L
Linus Torvalds 已提交
2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211
	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

I
Ilpo Järvinen 已提交
2212
#ifdef CONFIG_IP_PIMSM_V1
/* Handle IGMP messages of PIMv1
 *
 * Accepts only PIMv1 REGISTER messages on a table that has mroute_do_pim
 * set and passes them to __pim_rcv(). The skb is freed here when it is
 * rejected or when __pim_rcv() returns non-zero. Always returns 0.
 */
int pim_rcv_v1(struct sk_buff *skb)
{
	struct igmphdr *pim;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = igmp_hdr(skb);

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt))
		goto drop;
	if (!mrt->mroute_do_pim ||
	    pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
		goto drop;

	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
/* Protocol handler for PIMv2 packets.
 *
 * Only non-null REGISTER messages are accepted. The checksum is accepted
 * when it is valid either over the PIM header alone or over the whole
 * packet. Accepted packets go to __pim_rcv(); the skb is freed here when
 * it is rejected or when __pim_rcv() returns non-zero. Always returns 0.
 */
static int pim_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;
	struct net *net = dev_net(skb->dev);
	struct mr_table *mrt;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	if (pim->type != ((PIM_VERSION << 4) | (PIM_TYPE_REGISTER)) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	mrt = ipmr_rt_fib_lookup(net, skb);
	if (IS_ERR(mrt))
		goto drop;
	if (__pim_rcv(mrt, skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif

2268 2269
/* Fill multicast route information for (@saddr, @daddr) into @skb.
 *
 * Looks the pair up in the default table, falling back to a lookup keyed
 * on the group and the VIF of skb->dev. When an entry is found it is
 * written via mr_fill_mroute(). When none is found, a clone of @skb with
 * a minimal pseudo IP header (version 0) prepended is queued through
 * ipmr_cache_unresolved().
 *
 * Returns the result of mr_fill_mroute() / ipmr_cache_unresolved(), or
 * -ENOENT (no default table), -ENODEV (no VIF for skb->dev) or -ENOMEM
 * (clone failed).
 */
int ipmr_get_route(struct net *net, struct sk_buff *skb,
		   __be32 saddr, __be32 daddr,
		   struct rtmsg *rtm, u32 portid)
{
	struct mfc_cache *cache;
	struct mr_table *mrt;
	int err;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return -ENOENT;

	rcu_read_lock();
	cache = ipmr_cache_find(mrt, saddr, daddr);
	if (!cache && skb->dev) {
		int vif = ipmr_find_vif(mrt, skb->dev);

		if (vif >= 0)
			cache = ipmr_cache_find_any(mrt, daddr, vif);
	}
	if (!cache) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif = -1;

		dev = skb->dev;
		read_lock(&mrt_lock);
		if (dev)
			vif = ipmr_find_vif(mrt, dev);
		if (vif < 0) {
			read_unlock(&mrt_lock);
			rcu_read_unlock();
			return -ENODEV;
		}
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			rcu_read_unlock();
			return -ENOMEM;
		}

		/* Build the pseudo header carrying the looked-up addresses. */
		NETLINK_CB(skb2).portid = portid;
		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = saddr;
		iph->daddr = daddr;
		iph->version = 0;
		err = ipmr_cache_unresolved(mrt, vif, skb2, dev);
		read_unlock(&mrt_lock);
		rcu_read_unlock();
		return err;
	}

	read_lock(&mrt_lock);
	err = mr_fill_mroute(mrt, skb, &cache->_c, rtm);
	read_unlock(&mrt_lock);
	rcu_read_unlock();
	return err;
}

2331
/* Append one route message describing cache entry @c to @skb.
 *
 * Writes the rtmsg header, RTA_TABLE, RTA_SRC and RTA_DST, then lets
 * mr_fill_mroute() add the remaining attributes. A -ENOENT from
 * mr_fill_mroute() is tolerated so that unresolved entries still appear.
 *
 * Returns 0 on success, or -EMSGSIZE after cancelling the partially built
 * message.
 */
static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			    u32 portid, u32 seq, struct mfc_cache *c, int cmd,
			    int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	int err;

	nlh = nlmsg_put(skb, portid, seq, cmd, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family   = RTNL_FAMILY_IPMR;
	rtm->rtm_dst_len  = 32;
	rtm->rtm_src_len  = 32;
	rtm->rtm_tos      = 0;
	rtm->rtm_table    = mrt->id;
	if (nla_put_u32(skb, RTA_TABLE, mrt->id))
		goto nla_put_failure;
	rtm->rtm_type     = RTN_MULTICAST;
	rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
	if (c->_c.mfc_flags & MFC_STATIC)
		rtm->rtm_protocol = RTPROT_STATIC;
	else
		rtm->rtm_protocol = RTPROT_MROUTED;
	rtm->rtm_flags    = 0;

	if (nla_put_in_addr(skb, RTA_SRC, c->mfc_origin) ||
	    nla_put_in_addr(skb, RTA_DST, c->mfc_mcastgrp))
		goto nla_put_failure;
	err = mr_fill_mroute(mrt, skb, &c->_c, rtm);
	/* do not break the dump if cache is unresolved */
	if (err < 0 && err != -ENOENT)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

Y
Yuval Mintz 已提交
2375 2376 2377 2378 2379 2380 2381 2382
/* Adapter taking the generic struct mr_mfc: casts @c to the IPv4
 * struct mfc_cache and forwards every argument to ipmr_fill_mroute().
 */
static int _ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
			     u32 portid, u32 seq, struct mr_mfc *c, int cmd,
			     int flags)
{
	struct mfc_cache *mfc = (struct mfc_cache *)c;

	return ipmr_fill_mroute(mrt, skb, portid, seq, mfc, cmd, flags);
}

2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397
/* Upper bound on the netlink message size for one route entry.
 *
 * Every entry carries the rtmsg header plus RTA_TABLE, RTA_SRC and
 * RTA_DST. Resolved entries (@unresolved false) additionally reserve room
 * for RTA_IIF, an RTA_MULTIPATH nest with @maxvif nexthops, and
 * RTA_MFC_STATS.
 */
static size_t mroute_msgsize(bool unresolved, int maxvif)
{
	size_t len =
		NLMSG_ALIGN(sizeof(struct rtmsg))
		+ nla_total_size(4)	/* RTA_TABLE */
		+ nla_total_size(4)	/* RTA_SRC */
		+ nla_total_size(4)	/* RTA_DST */
		;

	if (!unresolved)
		len = len
		      + nla_total_size(4)	/* RTA_IIF */
		      + nla_total_size(0)	/* RTA_MULTIPATH */
		      + maxvif * NLA_ALIGN(sizeof(struct rtnexthop))
						/* RTA_MFC_STATS */
		      + nla_total_size_64bit(sizeof(struct rta_mfc_stats))
		;

	return len;
}

static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
				 int cmd)
{
	struct net *net = read_pnet(&mrt->net);
	struct sk_buff *skb;
	int err = -ENOBUFS;

2411 2412
	skb = nlmsg_new(mroute_msgsize(mfc->_c.mfc_parent >= MAXVIFS,
				       mrt->maxvif),
2413
			GFP_ATOMIC);
2414
	if (!skb)
2415 2416
		goto errout;

2417
	err = ipmr_fill_mroute(mrt, skb, 0, 0, mfc, cmd, 0);
2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429
	if (err < 0)
		goto errout;

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE, NULL, GFP_ATOMIC);
	return;

errout:
	kfree_skb(skb);
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE, err);
}

2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 2442 2443 2444 2445 2446 2447 2448 2449 2450 2451 2452 2453 2454 2455 2456 2457 2458 2459 2460 2461 2462 2463 2464 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492
/* Size of an RTM_NEWCACHEREPORT message carrying @payloadlen bytes of
 * packet data: the rtgenmsg header, four fixed attributes and the
 * variable-length packet attribute.
 */
static size_t igmpmsg_netlink_msgsize(size_t payloadlen)
{
	size_t total = NLMSG_ALIGN(sizeof(struct rtgenmsg));

	total += nla_total_size(1);		/* IPMRA_CREPORT_MSGTYPE */
	total += nla_total_size(4);		/* IPMRA_CREPORT_VIF_ID */
	total += nla_total_size(4);		/* IPMRA_CREPORT_SRC_ADDR */
	total += nla_total_size(4);		/* IPMRA_CREPORT_DST_ADDR */
	total += nla_total_size(payloadlen);	/* IPMRA_CREPORT_PKT */

	return total;
}

/* Send a cache report to RTNLGRP_IPV4_MROUTE_R listeners.
 *
 * @pkt starts (at its network header) with a struct igmpmsg. Its fields
 * become individual attributes and the bytes following it are copied into
 * IPMRA_CREPORT_PKT. On any failure the group is told -ENOBUFS.
 */
static void igmpmsg_netlink_event(struct mr_table *mrt, struct sk_buff *pkt)
{
	struct igmpmsg *report = (struct igmpmsg *)skb_network_header(pkt);
	int body_len = pkt->len - sizeof(struct igmpmsg);
	struct net *net = read_pnet(&mrt->net);
	struct rtgenmsg *hdr;
	struct nlmsghdr *nlh;
	struct sk_buff *skb;
	struct nlattr *body;

	skb = nlmsg_new(igmpmsg_netlink_msgsize(body_len), GFP_ATOMIC);
	if (!skb)
		goto errout;

	nlh = nlmsg_put(skb, 0, 0, RTM_NEWCACHEREPORT, sizeof(*hdr), 0);
	if (!nlh)
		goto errout;

	hdr = nlmsg_data(nlh);
	hdr->rtgen_family = RTNL_FAMILY_IPMR;

	if (nla_put_u8(skb, IPMRA_CREPORT_MSGTYPE, report->im_msgtype))
		goto nla_put_failure;
	if (nla_put_u32(skb, IPMRA_CREPORT_VIF_ID, report->im_vif))
		goto nla_put_failure;
	if (nla_put_in_addr(skb, IPMRA_CREPORT_SRC_ADDR,
			    report->im_src.s_addr))
		goto nla_put_failure;
	if (nla_put_in_addr(skb, IPMRA_CREPORT_DST_ADDR,
			    report->im_dst.s_addr))
		goto nla_put_failure;

	/* Reserve the attribute, then copy the payload after the igmpmsg. */
	body = nla_reserve(skb, IPMRA_CREPORT_PKT, body_len);
	if (!body)
		goto nla_put_failure;
	if (skb_copy_bits(pkt, sizeof(struct igmpmsg), nla_data(body),
			  body_len))
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

	rtnl_notify(skb, net, 0, RTNLGRP_IPV4_MROUTE_R, NULL, GFP_ATOMIC);
	return;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
errout:
	kfree_skb(skb);
	rtnl_set_sk_err(net, RTNLGRP_IPV4_MROUTE_R, -ENOBUFS);
}

D
Donald Sharp 已提交
2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517
/* Netlink handler answering a request for a single multicast route.
 *
 * Reads RTA_SRC, RTA_DST and RTA_TABLE from the request (a missing
 * address is treated as 0, a missing or zero table as RT_TABLE_DEFAULT),
 * looks up the matching cache entry and unicasts an RTM_NEWROUTE reply to
 * the requester.
 *
 * Returns the result of rtnl_unicast(), or a negative errno: the parse
 * error, -ENOENT (no table or no entry), -ENOBUFS, or the fill error.
 */
static int ipmr_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX + 1];
	struct sk_buff *skb = NULL;
	struct mfc_cache *cache;
	struct mr_table *mrt;
	struct rtmsg *rtm;
	__be32 src, grp;
	u32 tableid;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX,
			  rtm_ipv4_policy, extack);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	grp = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	tableid = tb[RTA_TABLE] ? nla_get_u32(tb[RTA_TABLE]) : 0;

	mrt = ipmr_get_table(net, tableid ? tableid : RT_TABLE_DEFAULT);
	if (!mrt) {
		err = -ENOENT;
		goto errout_free;
	}

	/* entries are added/deleted only under RTNL */
	rcu_read_lock();
	cache = ipmr_cache_find(mrt, src, grp);
	rcu_read_unlock();
	if (!cache) {
		err = -ENOENT;
		goto errout_free;
	}

	skb = nlmsg_new(mroute_msgsize(false, mrt->maxvif), GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout_free;
	}

	err = ipmr_fill_mroute(mrt, skb, NETLINK_CB(in_skb).portid,
			       nlh->nlmsg_seq, cache,
			       RTM_NEWROUTE, 0);
	if (err < 0)
		goto errout_free;

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout:
	return err;

errout_free:
	/* skb is still NULL on the paths taken before nlmsg_new() */
	kfree_skb(skb);
	goto errout;
}

2554 2555
static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
{
Y
Yuval Mintz 已提交
2556 2557
	return mr_rtm_dumproute(skb, cb, ipmr_mr_table_iter,
				_ipmr_fill_mroute, &mfc_unres_lock);
2558 2559
}

2560 2561 2562 2563 2564 2565 2566 2567 2568 2569 2570 2571 2572 2573 2574 2575 2576 2577 2578 2579 2580 2581 2582 2583 2584 2585 2586 2587 2588 2589 2590 2591 2592 2593 2594 2595
/* Attribute policy handed to nlmsg_validate() in rtm_to_ipmr_mfcc().
 * NOTE(review): RTA_MULTIPATH sets only .len; presumably that acts as a
 * minimum payload length for an untyped attribute - confirm against the
 * nla_policy documentation.
 */
static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
	[RTA_SRC]	= { .type = NLA_U32 },
	[RTA_DST]	= { .type = NLA_U32 },
	[RTA_IIF]	= { .type = NLA_U32 },
	[RTA_TABLE]	= { .type = NLA_U32 },
	[RTA_MULTIPATH]	= { .len = sizeof(struct rtnexthop) },
};

/* True when @rtm_protocol is one of the two route protocols accepted for
 * multicast routes: RTPROT_STATIC or RTPROT_MROUTED.
 */
static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
{
	return rtm_protocol == RTPROT_STATIC ||
	       rtm_protocol == RTPROT_MROUTED;
}

/* Copy per-VIF TTL thresholds from an RTA_MULTIPATH attribute.
 *
 * Each well-formed rtnexthop in @nla supplies mfcc->mfcc_ttls[i] for
 * successive i starting at 0.
 *
 * Returns the number of entries stored (0..MAXVIFS), or -EINVAL when the
 * attribute holds more than MAXVIFS entries or has trailing bytes that do
 * not form a valid rtnexthop.
 */
static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
{
	struct rtnexthop *rtnh = nla_data(nla);
	int remaining = nla_len(nla), vifi = 0;

	while (rtnh_ok(rtnh, remaining)) {
		/* Another valid entry while the table is already full means
		 * the list is too long. Checking before the store (rather
		 * than bailing out right after filling the last slot) lets a
		 * list of exactly MAXVIFS entries be consumed completely, so
		 * it is no longer misreported as having trailing data.
		 */
		if (vifi == MAXVIFS)
			return -EINVAL;
		mfcc->mfcc_ttls[vifi++] = rtnh->rtnh_hops;
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return remaining > 0 ? -EINVAL : vifi;
}

/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
			    struct mfcctl *mfcc, int *mrtsock,
2596 2597
			    struct mr_table **mrtret,
			    struct netlink_ext_ack *extack)
2598 2599 2600 2601 2602 2603 2604 2605
{
	struct net_device *dev = NULL;
	u32 tblid = RT_TABLE_DEFAULT;
	struct mr_table *mrt;
	struct nlattr *attr;
	struct rtmsg *rtm;
	int ret, rem;

2606
	ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy,
2607
			     extack);
2608 2609 2610 2611 2612 2613 2614 2615 2616 2617 2618 2619 2620 2621 2622 2623 2624 2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665
	if (ret < 0)
		goto out;
	rtm = nlmsg_data(nlh);

	ret = -EINVAL;
	if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
	    rtm->rtm_type != RTN_MULTICAST ||
	    rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
	    !ipmr_rtm_validate_proto(rtm->rtm_protocol))
		goto out;

	memset(mfcc, 0, sizeof(*mfcc));
	mfcc->mfcc_parent = -1;
	ret = 0;
	nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
		switch (nla_type(attr)) {
		case RTA_SRC:
			mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
			break;
		case RTA_DST:
			mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
			break;
		case RTA_IIF:
			dev = __dev_get_by_index(net, nla_get_u32(attr));
			if (!dev) {
				ret = -ENODEV;
				goto out;
			}
			break;
		case RTA_MULTIPATH:
			if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
				ret = -EINVAL;
				goto out;
			}
			break;
		case RTA_PREFSRC:
			ret = 1;
			break;
		case RTA_TABLE:
			tblid = nla_get_u32(attr);
			break;
		}
	}
	mrt = ipmr_get_table(net, tblid);
	if (!mrt) {
		ret = -ENOENT;
		goto out;
	}
	*mrtret = mrt;
	*mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
	if (dev)
		mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);

out:
	return ret;
}

/* takes care of both newroute and delroute */
2666 2667
static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh,
			  struct netlink_ext_ack *extack)
2668 2669 2670 2671 2672 2673 2674 2675
{
	struct net *net = sock_net(skb->sk);
	int ret, mrtsock, parent;
	struct mr_table *tbl;
	struct mfcctl mfcc;

	mrtsock = 0;
	tbl = NULL;
2676
	ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl, extack);
2677 2678 2679 2680 2681 2682 2683 2684 2685 2686
	if (ret < 0)
		return ret;

	parent = ret ? mfcc.mfcc_parent : -1;
	if (nlh->nlmsg_type == RTM_NEWROUTE)
		return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
	else
		return ipmr_mfc_delete(tbl, &mfcc, parent);
}

2687 2688 2689 2690 2691 2692 2693 2694 2695 2696 2697 2698 2699 2700 2701 2702 2703 2704 2705 2706 2707 2708 2709 2710 2711 2712 2713 2714 2715 2716 2717 2718 2719 2720 2721 2722 2723 2724 2725 2726 2727 2728 2729 2730 2731 2732 2733 2734 2735 2736 2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747 2748 2749 2750 2751 2752 2753 2754 2755 2756 2757 2758 2759 2760 2761 2762 2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787 2788 2789 2790 2791 2792 2793 2794 2795 2796 2797 2798 2799 2800 2801 2802 2803 2804 2805 2806 2807 2808 2809
static bool ipmr_fill_table(struct mr_table *mrt, struct sk_buff *skb)
{
	u32 queue_len = atomic_read(&mrt->cache_resolve_queue_len);

	if (nla_put_u32(skb, IPMRA_TABLE_ID, mrt->id) ||
	    nla_put_u32(skb, IPMRA_TABLE_CACHE_RES_QUEUE_LEN, queue_len) ||
	    nla_put_s32(skb, IPMRA_TABLE_MROUTE_REG_VIF_NUM,
			mrt->mroute_reg_vif_num) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_ASSERT,
		       mrt->mroute_do_assert) ||
	    nla_put_u8(skb, IPMRA_TABLE_MROUTE_DO_PIM, mrt->mroute_do_pim))
		return false;

	return true;
}

/* Append an IPMRA_VIF nest describing VIF @vifid of @mrt to @skb.
 *
 * A VIF slot that does not exist is skipped and reported as success.
 * Returns false when the nest or one of its attributes does not fit; a
 * partially written nest is cancelled first.
 */
static bool ipmr_fill_vif(struct mr_table *mrt, u32 vifid, struct sk_buff *skb)
{
	struct vif_device *vif;
	struct nlattr *nest;

	/* if the VIF doesn't exist just continue */
	if (!VIF_EXISTS(mrt, vifid))
		return true;

	vif = &mrt->vif_table[vifid];
	nest = nla_nest_start(skb, IPMRA_VIF);
	if (!nest)
		return false;

	if (nla_put_u32(skb, IPMRA_VIFA_IFINDEX, vif->dev->ifindex))
		goto cancel;
	if (nla_put_u32(skb, IPMRA_VIFA_VIF_ID, vifid))
		goto cancel;
	if (nla_put_u16(skb, IPMRA_VIFA_FLAGS, vif->flags))
		goto cancel;
	if (nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_IN, vif->bytes_in,
			      IPMRA_VIFA_PAD))
		goto cancel;
	if (nla_put_u64_64bit(skb, IPMRA_VIFA_BYTES_OUT, vif->bytes_out,
			      IPMRA_VIFA_PAD))
		goto cancel;
	if (nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_IN, vif->pkt_in,
			      IPMRA_VIFA_PAD))
		goto cancel;
	if (nla_put_u64_64bit(skb, IPMRA_VIFA_PACKETS_OUT, vif->pkt_out,
			      IPMRA_VIFA_PAD))
		goto cancel;
	if (nla_put_be32(skb, IPMRA_VIFA_LOCAL_ADDR, vif->local))
		goto cancel;
	if (nla_put_be32(skb, IPMRA_VIFA_REMOTE_ADDR, vif->remote))
		goto cancel;

	nla_nest_end(skb, nest);
	return true;

cancel:
	nla_nest_cancel(skb, nest);
	return false;
}

/* Netlink dump callback emitting one RTM_NEWLINK message per multicast
 * routing table. Each message carries an IFLA_AF_SPEC nest with the table
 * attributes and an IPMRA_TABLE_VIFS nest listing the table's VIFs.
 *
 * cb->args[0] is the index of the table to resume from and cb->args[1]
 * the VIF index within that table; both are written back at "out".
 * Returns skb->len.
 */
static int ipmr_rtm_dumplink(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	struct nlmsghdr *nlh = NULL;
	unsigned int t = 0, s_t;
	unsigned int e = 0, s_e;
	struct mr_table *mrt;

	/* resume position saved by the previous invocation */
	s_t = cb->args[0];
	s_e = cb->args[1];

	ipmr_for_each_table(mrt, net) {
		struct nlattr *vifs, *af;
		struct ifinfomsg *hdr;
		u32 i;

		/* tables before the resume point were already dumped */
		if (t < s_t)
			goto skip_table;
		nlh = nlmsg_put(skb, NETLINK_CB(cb->skb).portid,
				cb->nlh->nlmsg_seq, RTM_NEWLINK,
				sizeof(*hdr), NLM_F_MULTI);
		if (!nlh)
			break;

		hdr = nlmsg_data(nlh);
		memset(hdr, 0, sizeof(*hdr));
		hdr->ifi_family = RTNL_FAMILY_IPMR;

		af = nla_nest_start(skb, IFLA_AF_SPEC);
		if (!af) {
			nlmsg_cancel(skb, nlh);
			goto out;
		}

		if (!ipmr_fill_table(mrt, skb)) {
			nlmsg_cancel(skb, nlh);
			goto out;
		}

		vifs = nla_nest_start(skb, IPMRA_TABLE_VIFS);
		if (!vifs) {
			/* keep what was written so far and finish the message */
			nla_nest_end(skb, af);
			nlmsg_end(skb, nlh);
			goto out;
		}
		for (i = 0; i < mrt->maxvif; i++) {
			/* VIFs before the resume point were already dumped */
			if (e < s_e)
				goto skip_entry;
			if (!ipmr_fill_vif(mrt, i, skb)) {
				/* out of room: close the nests, resume at e */
				nla_nest_end(skb, vifs);
				nla_nest_end(skb, af);
				nlmsg_end(skb, nlh);
				goto out;
			}
skip_entry:
			e++;
		}
		/* table finished: the next table starts from its first VIF */
		s_e = 0;
		e = 0;
		nla_nest_end(skb, vifs);
		nla_nest_end(skb, af);
		nlmsg_end(skb, nlh);
skip_table:
		t++;
	}

out:
	cb->args[1] = e;
	cb->args[0] = t;

	return skb->len;
}

2810
#ifdef CONFIG_PROC_FS
2811 2812
/* The /proc interfaces to multicast routing :
 * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
L
Linus Torvalds 已提交
2813 2814 2815
 */

/* seq_file ->start for the VIF listing.
 *
 * Takes mrt_lock for reading, stores the default table in the iterator
 * and delegates positioning to mr_vif_seq_start().
 *
 * Returns the element at *@pos as produced by mr_vif_seq_start(), or
 * ERR_PTR(-ENOENT) when the default table does not exist. The lock is
 * held on return in both cases.
 */
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	struct mr_vif_iter *iter = seq->private;
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	/* seq_file invokes ->stop() even when ->start() returned an error,
	 * and ipmr_vif_seq_stop() unconditionally drops mrt_lock. Take the
	 * lock before anything can fail so the unlock is always balanced.
	 */
	read_lock(&mrt_lock);

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	iter->mrt = mrt;

	return mr_vif_seq_start(seq, pos);
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
S
Stephen Hemminger 已提交
2833
	__releases(mrt_lock)
L
Linus Torvalds 已提交
2834 2835 2836 2837 2838 2839
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
2840
	struct mr_vif_iter *iter = seq->private;
2841
	struct mr_table *mrt = iter->mrt;
2842

L
Linus Torvalds 已提交
2843
	if (v == SEQ_START_TOKEN) {
2844
		seq_puts(seq,
L
Linus Torvalds 已提交
2845 2846 2847
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
2848 2849
		const char *name =  vif->dev ?
				    vif->dev->name : "none";
L
Linus Torvalds 已提交
2850 2851

		seq_printf(seq,
2852
			   "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2853
			   vif - mrt->vif_table,
2854
			   name, vif->bytes_in, vif->pkt_in,
L
Linus Torvalds 已提交
2855 2856 2857 2858 2859 2860
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

2861
static const struct seq_operations ipmr_vif_seq_ops = {
L
Linus Torvalds 已提交
2862
	.start = ipmr_vif_seq_start,
2863
	.next  = mr_vif_seq_next,
L
Linus Torvalds 已提交
2864 2865 2866 2867 2868 2869
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
2870
	return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2871
			    sizeof(struct mr_vif_iter));
L
Linus Torvalds 已提交
2872 2873
}

2874
static const struct file_operations ipmr_vif_fops = {
L
Linus Torvalds 已提交
2875 2876 2877
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
2878
	.release = seq_release_net,
L
Linus Torvalds 已提交
2879 2880 2881 2882
};

/* seq_file ->start for the MFC cache listing.
 *
 * Looks up the default table and delegates to mr_mfc_seq_start() with the
 * unresolved-queue lock. Returns ERR_PTR(-ENOENT) when the default table
 * does not exist.
 */
static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	struct mr_table *mrt;

	mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
	if (!mrt)
		return ERR_PTR(-ENOENT);

	return mr_mfc_seq_start(seq, pos, mrt, &mfc_unres_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
2898
		seq_puts(seq,
L
Linus Torvalds 已提交
2899 2900 2901
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
Y
Yuval Mintz 已提交
2902
		const struct mr_mfc_iter *it = seq->private;
2903
		const struct mr_table *mrt = it->mrt;
2904

2905 2906 2907
		seq_printf(seq, "%08X %08X %-3hd",
			   (__force u32) mfc->mfc_mcastgrp,
			   (__force u32) mfc->mfc_origin,
2908
			   mfc->_c.mfc_parent);
L
Linus Torvalds 已提交
2909

2910
		if (it->cache != &mrt->mfc_unres_queue) {
2911
			seq_printf(seq, " %8lu %8lu %8lu",
2912 2913 2914 2915 2916
				   mfc->_c.mfc_un.res.pkt,
				   mfc->_c.mfc_un.res.bytes,
				   mfc->_c.mfc_un.res.wrong_if);
			for (n = mfc->_c.mfc_un.res.minvif;
			     n < mfc->_c.mfc_un.res.maxvif; n++) {
2917
				if (VIF_EXISTS(mrt, n) &&
2918
				    mfc->_c.mfc_un.res.ttls[n] < 255)
2919
					seq_printf(seq,
2920
					   " %2d:%-3d",
2921
					   n, mfc->_c.mfc_un.res.ttls[n]);
L
Linus Torvalds 已提交
2922
			}
2923 2924 2925 2926 2927
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
L
Linus Torvalds 已提交
2928 2929 2930 2931 2932 2933
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

2934
static const struct seq_operations ipmr_mfc_seq_ops = {
L
Linus Torvalds 已提交
2935
	.start = ipmr_mfc_seq_start,
Y
Yuval Mintz 已提交
2936 2937
	.next  = mr_mfc_seq_next,
	.stop  = mr_mfc_seq_stop,
L
Linus Torvalds 已提交
2938 2939 2940 2941 2942
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
2943
	return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
Y
Yuval Mintz 已提交
2944
			    sizeof(struct mr_mfc_iter));
L
Linus Torvalds 已提交
2945 2946
}

2947
static const struct file_operations ipmr_mfc_fops = {
L
Linus Torvalds 已提交
2948 2949 2950
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
2951
	.release = seq_release_net,
L
Linus Torvalds 已提交
2952
};
2953
#endif
L
Linus Torvalds 已提交
2954 2955

#ifdef CONFIG_IP_PIMSM_V2
/* Protocol descriptor that routes received PIM packets to pim_rcv(). */
static const struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
	.netns_ok	=	1,
};
#endif

2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979
/* fib notifier sequence number for @net: the ipmr change counter plus the
 * ipmr rules counter. Asserts that RTNL is held.
 */
static unsigned int ipmr_seq_read(struct net *net)
{
	unsigned int seq;

	ASSERT_RTNL();

	seq = net->ipv4.ipmr_seq;
	seq += ipmr_rules_seq_read(net);

	return seq;
}

/*
 * fib_dump callback of ipmr_notifier_ops_template.
 *
 * Replays the current state of @net to the single notifier block @nb:
 * first the rules (ipmr_rules_dump()), then, for every table, one
 * FIB_EVENT_VIF_ADD per VIF slot that has a device attached and one
 * FIB_EVENT_ENTRY_ADD per entry on the table's mfc_cache_list.
 *
 * Returns the error from ipmr_rules_dump() if it fails, 0 otherwise.
 */
static int ipmr_dump(struct net *net, struct notifier_block *nb)
{
	struct mr_table *mrt;
	int err;

	err = ipmr_rules_dump(net, nb);
	if (err)
		return err;

	ipmr_for_each_table(mrt, net) {
		struct mr_mfc *mfc;
		int vifi;

		/* Notify on table VIF entries; the table is walked with
		 * mrt_lock read-held and unused slots (no device) are
		 * skipped.
		 */
		read_lock(&mrt_lock);
		for (vifi = 0; vifi < mrt->maxvif; vifi++) {
			struct vif_device *vif = &mrt->vif_table[vifi];

			if (vif->dev)
				call_ipmr_vif_entry_notifier(nb, net,
							     FIB_EVENT_VIF_ADD,
							     vif, vifi,
							     mrt->id);
		}
		read_unlock(&mrt_lock);

		/* Notify on table MFC entries */
		list_for_each_entry_rcu(mfc, &mrt->mfc_cache_list, list)
			call_ipmr_mfc_entry_notifier(nb, net,
						     FIB_EVENT_ENTRY_ADD,
						     (struct mfc_cache *)mfc,
						     mrt->id);
	}

	return 0;
}

/*
 * Template passed to fib_notifier_ops_register() by
 * ipmr_notifier_init() for each network namespace.
 */
static const struct fib_notifier_ops ipmr_notifier_ops_template = {
	.owner		= THIS_MODULE,
	.family		= RTNL_FAMILY_IPMR,
	.fib_dump	= ipmr_dump,
	.fib_seq_read	= ipmr_seq_read,
};

3012
/*
 * Per-namespace notifier setup: resets the ipmr_seq counter and
 * registers a copy of ipmr_notifier_ops_template for @net.
 *
 * On success the registered ops are stored in
 * net->ipv4.ipmr_notifier_ops and 0 is returned.  On failure the
 * pointer is left untouched and the error encoded in the returned
 * ERR_PTR is passed back.
 */
static int __net_init ipmr_notifier_init(struct net *net)
{
	struct fib_notifier_ops *registered;

	net->ipv4.ipmr_seq = 0;

	registered = fib_notifier_ops_register(&ipmr_notifier_ops_template,
					       net);
	if (!IS_ERR(registered)) {
		net->ipv4.ipmr_notifier_ops = registered;
		return 0;
	}

	return PTR_ERR(registered);
}

/*
 * Undo ipmr_notifier_init(): unregister the namespace's notifier ops,
 * then clear the stored pointer.
 */
static void __net_exit ipmr_notifier_exit(struct net *net)
{
	struct fib_notifier_ops *ops = net->ipv4.ipmr_notifier_ops;

	fib_notifier_ops_unregister(ops);
	net->ipv4.ipmr_notifier_ops = NULL;
}

3032
/* Setup for IP multicast routing */
3033 3034
static int __net_init ipmr_net_init(struct net *net)
{
3035
	int err;
3036

3037 3038 3039 3040
	err = ipmr_notifier_init(net);
	if (err)
		goto ipmr_notifier_fail;

3041 3042
	err = ipmr_rules_init(net);
	if (err < 0)
3043
		goto ipmr_rules_fail;
3044 3045 3046

#ifdef CONFIG_PROC_FS
	err = -ENOMEM;
3047
	if (!proc_create("ip_mr_vif", 0, net->proc_net, &ipmr_vif_fops))
3048
		goto proc_vif_fail;
3049
	if (!proc_create("ip_mr_cache", 0, net->proc_net, &ipmr_mfc_fops))
3050 3051
		goto proc_cache_fail;
#endif
3052 3053
	return 0;

3054 3055
#ifdef CONFIG_PROC_FS
proc_cache_fail:
3056
	remove_proc_entry("ip_mr_vif", net->proc_net);
3057
proc_vif_fail:
3058
	ipmr_rules_exit(net);
3059
#endif
3060 3061 3062
ipmr_rules_fail:
	ipmr_notifier_exit(net);
ipmr_notifier_fail:
3063 3064 3065 3066 3067
	return err;
}

/*
 * Per-namespace teardown: drops the "ip_mr_cache" and "ip_mr_vif" proc
 * entries (CONFIG_PROC_FS only), then the notifier ops, then the rules.
 */
static void __net_exit ipmr_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	struct proc_dir_entry *proc_root = net->proc_net;

	remove_proc_entry("ip_mr_cache", proc_root);
	remove_proc_entry("ip_mr_vif", proc_root);
#endif
	ipmr_notifier_exit(net);
	ipmr_rules_exit(net);
}

static struct pernet_operations ipmr_net_ops = {
	.init = ipmr_net_init,
	.exit = ipmr_net_exit,
3079
	.async = true,
3080
};
3081

W
Wang Chen 已提交
3082
/*
 * Boot-time initialisation of IPv4 multicast routing.
 *
 * Creates the mfc_cache slab, registers the per-namespace operations
 * and the netdevice notifier, adds the PIM protocol handler when
 * CONFIG_IP_PIMSM_V2 is set, and finally registers the rtnetlink
 * handlers for the RTNL_FAMILY_IPMR family.
 *
 * Returns 0 on success.  On failure everything registered so far is
 * undone in reverse order and the error is returned (-EAGAIN when the
 * PIM protocol cannot be added).
 */
int __init ip_mr_init(void)
{
	int err;

	/* SLAB_PANIC is requested, which is presumably why the result is
	 * not NULL-checked here.
	 */
	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache), 0,
				       SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);

	err = register_pernet_subsys(&ipmr_net_ops);
	if (err)
		goto reg_pernet_fail;

	err = register_netdevice_notifier(&ip_mr_notifier);
	if (err)
		goto reg_notif_fail;

#ifdef CONFIG_IP_PIMSM_V2
	if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
		pr_err("%s: can't add PIM protocol\n", __func__);
		err = -EAGAIN;
		goto add_proto_fail;
	}
#endif

	/* Route get/dump, add and delete */
	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, ipmr_rtm_getroute,
		      ipmr_rtm_dumproute, 0);
	rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE, ipmr_rtm_route, NULL, 0);
	rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE, ipmr_rtm_route, NULL, 0);

	/* Link dump only */
	rtnl_register(RTNL_FAMILY_IPMR, RTM_GETLINK, NULL, ipmr_rtm_dumplink,
		      0);

	return 0;

#ifdef CONFIG_IP_PIMSM_V2
add_proto_fail:
	unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
	unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);
	return err;
}