/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

#define pr_fmt(fmt) "IPv6: " fmt

#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>

#include <linux/uaccess.h>

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
	RT6_NUD_SUCCEED = 1
};

static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
			       struct net_device *dev, int how);
static int		 ip6_dst_gc(struct dst_ops *ops);

static int		ip6_pkt_discard(struct sk_buff *skb);
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int		ip6_pkt_prohibit(struct sk_buff *skb);
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void		ip6_link_failure(struct sk_buff *skb);
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct rt6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
#endif

struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
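
/* Routes on this per-cpu list are not attached to the FIB tree, such as
 * the uncached RTF_CACHE clone created in ip6_pol_route() for
 * FLOWI_FLAG_KNOWN_NH lookups.  rt6_uncached_list_flush_dev() walks the
 * list when a device is unregistered and re-points the entries at the
 * namespace's loopback device.
 */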

void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt6_uncached_list_del(struct rt6_info *rt)
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
		struct net *net = dev_net(rt->dst.dev);

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

	if (dev == loopback_dev)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

			if (rt_idev->dev == dev) {
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

			if (rt_dev == dev) {
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *p = &rt->rt6i_gateway;

	if (!ipv6_addr_any(p))
		return (const void *) p;
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
	return daddr;
}

static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *) dst;
	struct neighbour *n;

	daddr = choose_neigh_daddr(rt, skb, daddr);
	n = __ipv6_neigh_lookup(dst->dev, daddr);
	if (n)
		return n;
	return neigh_create(&nd_tbl, daddr, dst->dev);
}

static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

	daddr = choose_neigh_daddr(rt, NULL, daddr);
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

static struct dst_ops ip6_dst_ops_template = {
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
	.default_advmss		=	ip6_default_advmss,
	.mtu			=	ip6_mtu,
	.cow_metrics		=	dst_cow_metrics_generic,
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
	.redirect		=	rt6_do_redirect,
	.local_out		=	__ip6_local_out,
	.neigh_lookup		=	ip6_neigh_lookup,
	.confirm_neigh		=	ip6_confirm_neigh,
};

static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
{
}

static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
}

static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
	.mtu			=	ip6_blackhole_mtu,
	.default_advmss		=	ip6_default_advmss,
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
	.redirect		=	ip6_rt_blackhole_redirect,
	.cow_metrics		=	dst_cow_metrics_generic,
	.neigh_lookup		=	ip6_neigh_lookup,
};

static const u32 ip6_template_metrics[RTAX_MAX] = {
	[RTAX_HOPLIMIT - 1] = 0,
};
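
/* Template routes used when a lookup cannot return a real route:
 * fib6_null_entry is handed back by the FIB code when nothing matches,
 * while ip6_null_entry (and, with multiple tables, the prohibit and
 * blackhole entries) are dst-level catch-alls that drop the packet and
 * report the corresponding error.
 */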

static const struct rt6_info fib6_null_entry_template = {
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32)0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

static const struct rt6_info ip6_null_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_UNREACHABLE,
};

#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static const struct rt6_info ip6_prohibit_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_PROHIBIT,
};

static const struct rt6_info ip6_blk_hole_entry_template = {
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
		.error		= -EINVAL,
		.input		= dst_discard,
		.output		= dst_discard_out,
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.rt6i_protocol  = RTPROT_KERNEL,
	.rt6i_metric	= ~(u32) 0,
	.rt6i_ref	= ATOMIC_INIT(1),
	.fib6_type	= RTN_BLACKHOLE,
};

#endif
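
/* Clear and (re)initialise the rt6_info-specific part of a freshly
 * allocated entry; dst_alloc() has already set up the embedded
 * dst_entry, so only everything after it is zeroed here.
 */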

static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_siblings);
	INIT_LIST_HEAD(&rt->rt6i_uncached);
	rt->fib6_metrics = (struct dst_metrics *)&dst_default_metrics;
}
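
/* __ip6_dst_alloc() returns a bare rt6_info; ip6_dst_alloc() also
 * allocates the rt6i_pcpu array used to hand out per-cpu copies and
 * releases the dst again if that allocation fails.
 */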

/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
					struct net_device *dev,
					int flags)
{
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
					1, DST_OBSOLETE_FORCE_CHK, flags);

	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}

	return rt;
}

struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (rt) {
		rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
		if (!rt->rt6i_pcpu) {
			dst_release_immediate(&rt->dst);
			return NULL;
		}
	}

	return rt;
}
EXPORT_SYMBOL(ip6_dst_alloc);

static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_exception_bucket *bucket;
	struct rt6_info *from = rt->from;
	struct inet6_dev *idev;
	struct dst_metrics *m;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (bucket) {
		rt->rt6i_exception_bucket = NULL;
		kfree(bucket);
	}

	m = rt->fib6_metrics;
	if (m != &dst_default_metrics && refcount_dec_and_test(&m->refcnt))
		kfree(m);

	rt->from = NULL;
	dst_release(&from->dst);
}

static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
	struct net_device *loopback_dev =
		dev_net(dev)->loopback_dev;

	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
		}
	}
}

static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
			return true;
	} else if (rt->from) {
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
			fib6_check_expired(rt->from);
	}
	return false;
}
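
/* Hash-threshold multipath selection: the flow hash in fl6->mp_hash is
 * compared against each next hop's nh_upper_bound; the first sibling
 * whose bound covers the hash is chosen, falling back to the original
 * match if that sibling does not score as usable.
 */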

static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

	return match;
}

/*
 *	Route lookup. rcu_read_lock() should be held.
 */

489 490
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
491
						    const struct in6_addr *saddr,
L
Linus Torvalds 已提交
492
						    int oif,
493
						    int flags)
L
Linus Torvalds 已提交
494 495 496 497
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

498 499
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
500
		return rt;
501

502
	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
503
		const struct net_device *dev = sprt->fib6_nh.nh_dev;
504

505
		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
506 507
			continue;

508
		if (oif) {
L
Linus Torvalds 已提交
509 510 511
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
512
				if (!sprt->rt6i_idev ||
L
Linus Torvalds 已提交
513
				    sprt->rt6i_idev->dev->ifindex != oif) {
514
					if (flags & RT6_LOOKUP_F_IFACE)
L
Linus Torvalds 已提交
515
						continue;
516 517
					if (local &&
					    local->rt6i_idev->dev->ifindex == oif)
L
Linus Torvalds 已提交
518 519 520 521
						continue;
				}
				local = sprt;
			}
522 523 524 525
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
L
Linus Torvalds 已提交
526
		}
527
	}
L
Linus Torvalds 已提交
528

529
	if (oif) {
L
Linus Torvalds 已提交
530 531 532
		if (local)
			return local;

533
		if (flags & RT6_LOOKUP_F_IFACE)
D
David Ahern 已提交
534
			return net->ipv6.fib6_null_entry;
L
Linus Torvalds 已提交
535
	}
536

D
David Ahern 已提交
537
	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
L
Linus Torvalds 已提交
538 539
}

#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
554
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
555
	dev_put(work->dev);
556
	kfree(work);
557 558
}

559 560
static void rt6_probe(struct rt6_info *rt)
{
561
	struct __rt6_probe_work *work;
562
	const struct in6_addr *nh_gw;
563
	struct neighbour *neigh;
564 565
	struct net_device *dev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
574
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
575
		return;
576 577 578

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
579
	rcu_read_lock_bh();
580
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
581
	if (neigh) {
582 583 584
		if (neigh->nud_state & NUD_VALID)
			goto out;

585
		work = NULL;
586
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
594
		}
595
		write_unlock(&neigh->lock);
596 597
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
598
	}
599 600 601

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
602 603 604
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
605 606 607
		schedule_work(&work->work);
	}

608
out:
609
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif

L
Linus Torvalds 已提交
617
/*
618
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
619
 */
D
Dave Jones 已提交
620
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
621
{
622 623
	const struct net_device *dev = rt->fib6_nh.nh_dev;

624
	if (!oif || dev->ifindex == oif)
625
		return 2;
	if ((dev->flags & IFF_LOOPBACK) &&
	    rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;
	return 0;
630
}
L
Linus Torvalds 已提交
631

632
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
L
Linus Torvalds 已提交
633
{
634
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
635
	struct neighbour *neigh;
636

637 638
	if (rt->rt6i_flags & RTF_NONEXTHOP ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
639
		return RT6_NUD_SUCCEED;
640 641

	rcu_read_lock_bh();
642 643
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
644 645
	if (neigh) {
		read_lock(&neigh->lock);
646
		if (neigh->nud_state & NUD_VALID)
647
			ret = RT6_NUD_SUCCEED;
648
#ifdef CONFIG_IPV6_ROUTER_PREF
649
		else if (!(neigh->nud_state & NUD_FAILED))
650
			ret = RT6_NUD_SUCCEED;
J
Jiri Benc 已提交
651 652
		else
			ret = RT6_NUD_FAIL_PROBE;
653
#endif
654
		read_unlock(&neigh->lock);
655 656
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
J
Jiri Benc 已提交
657
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
658
	}
659 660
	rcu_read_unlock_bh();

661
	return ret;
L
Linus Torvalds 已提交
662 663
}

664 665
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
L
Linus Torvalds 已提交
666
{
667
	int m;
668

669
	m = rt6_check_dev(rt, oif);
670
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
671
		return RT6_NUD_FAIL_HARD;
672 673 674
#ifdef CONFIG_IPV6_ROUTER_PREF
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
680 681 682
	return m;
}

683
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
684 685
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
686
{
687
	int m;
688
	bool match_do_rr = false;
689 690
	struct inet6_dev *idev = rt->rt6i_idev;

691
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
692 693
		goto out;

694
	if (idev->cnf.ignore_routes_with_linkdown &&
695
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
696
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
697
		goto out;
698

699
	if (fib6_check_expired(rt))
700 701 702
		goto out;

	m = rt6_score_route(rt, oif, strict);
J
Jiri Benc 已提交
703
	if (m == RT6_NUD_FAIL_DO_RR) {
704 705
		match_do_rr = true;
		m = 0; /* lowest valid score */
J
Jiri Benc 已提交
706
	} else if (m == RT6_NUD_FAIL_HARD) {
707
		goto out;
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);
712

J
Jiri Benc 已提交
713
	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
714
	if (m > *mpri) {
715
		*do_rr = match_do_rr;
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
W
Wei Wang 已提交
724
				     struct rt6_info *leaf,
725
				     struct rt6_info *rr_head,
726 727
				     u32 metric, int oif, int strict,
				     bool *do_rr)
728
{
729
	struct rt6_info *rt, *match, *cont;
730
	int mpri = -1;
L
Linus Torvalds 已提交
731

732
	match = NULL;
733
	cont = NULL;
734
	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

743
	for (rt = leaf; rt && rt != rr_head;
744
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}

750
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

	if (match || !cont)
		return match;

756
	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
757
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
L
Linus Torvalds 已提交
758

759 760
	return match;
}
L
Linus Torvalds 已提交
761

W
Wei Wang 已提交
762 763
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
764
{
765
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
766
	struct rt6_info *match, *rt0;
767
	bool do_rr = false;
768
	int key_plen;
L
Linus Torvalds 已提交
769

D
David Ahern 已提交
770 771
	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;
W
Wei Wang 已提交
772

773
	rt0 = rcu_dereference(fn->rr_ptr);
774
	if (!rt0)
775
		rt0 = leaf;
L
Linus Torvalds 已提交
776

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rt0->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rt0->rt6i_src.plen)
		key_plen = rt0->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
D
David Ahern 已提交
788
		return net->ipv6.fib6_null_entry;
789

W
Wei Wang 已提交
790
	match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
791
			     &do_rr);
L
Linus Torvalds 已提交
792

793
	if (do_rr) {
794
		struct rt6_info *next = rcu_dereference(rt0->rt6_next);
795

796
		/* no entries matched; do round-robin */
797
		if (!next || next->rt6i_metric != rt0->rt6i_metric)
W
Wei Wang 已提交
798
			next = leaf;
799

		if (next != rt0) {
			spin_lock_bh(&leaf->rt6i_table->tb6_lock);
			/* make sure next is not being deleted from the tree */
			if (next->rt6i_node)
				rcu_assign_pointer(fn->rr_ptr, next);
			spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
		}
L
Linus Torvalds 已提交
807 808
	}

D
David Ahern 已提交
809
	return match ? match : net->ipv6.fib6_null_entry;
L
Linus Torvalds 已提交
810 811
}

static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
}

817 818
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
819
		  const struct in6_addr *gwaddr)
820
{
821
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
825
	unsigned long lifetime;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
849
		return -EINVAL;
850

851
	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

863
	if (rinfo->prefix_len == 0)
864
		rt = rt6_get_dflt_router(net, gwaddr, dev);
865 866
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
867
					gwaddr, dev);
868 869

	if (rt && !lifetime) {
870
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
875 876
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
882
		if (!addrconf_finite_timeout(lifetime))
883
			fib6_clean_expires(rt);
884
		else
885
			fib6_set_expires(rt, jiffies + HZ * lifetime);
886

A
Amerigo Wang 已提交
887
		ip6_rt_put(rt);
	}
	return 0;
}
#endif

/*
 *	Misc support functions
 */

/* called with rcu_lock held */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
900
	struct net_device *dev = rt->fib6_nh.nh_dev;

	if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
		    !rt6_need_strict(&rt->rt6i_dst.addr))
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}
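
/* Map fib6 route types to the dst->error value a rejecting route should
 * report; ip6_rt_init_dst_reject() uses this table to pick the error and
 * the matching discard/prohibit handlers.
 */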

static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct rt6_info *ort)
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

static void ip6_rt_init_dst(struct rt6_info *rt, struct rt6_info *ort)
{
	if (ort->rt6i_flags & RTF_REJECT) {
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.flags |= DST_HOST;
		rt->dst.input = ip6_input;
	} else if (ipv6_addr_type(&ort->rt6i_dst.addr) & IPV6_ADDR_MULTICAST) {
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
{
	BUG_ON(from->from);

	rt->rt6i_flags &= ~RTF_EXPIRES;
	dst_hold(&from->dst);
	rt->from = from;
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
}

static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
{
1005 1006
	ip6_rt_init_dst(rt, ort);

	rt->rt6i_dst = ort->rt6i_dst;
	rt->rt6i_idev = ort->rt6i_idev;
	if (rt->rt6i_idev)
		in6_dev_hold(rt->rt6i_idev);
1011
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
	rt->rt6i_flags = ort->rt6i_flags;
	rt6_set_from(rt, ort);
	rt->rt6i_metric = ort->rt6i_metric;
#ifdef CONFIG_IPV6_SUBTREES
	rt->rt6i_src = ort->rt6i_src;
#endif
	rt->rt6i_prefsrc = ort->rt6i_prefsrc;
	rt->rt6i_table = ort->rt6i_table;
1020
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
1021 1022
}

M
Martin KaFai Lau 已提交
1023 1024 1025
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
1026
	struct fib6_node *pn, *sn;
M
Martin KaFai Lau 已提交
1027 1028 1029
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
			fn = fib6_lookup(sn, NULL, saddr);
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}

static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}
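
/* Flow-based lookup used by rt6_lookup(): it relies on rt6_device_match()
 * and the exception table rather than the full rt6_select() path, and it
 * never creates cache or per-cpu clones.
 */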

static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
L
Linus Torvalds 已提交
1063
{
1064
	struct rt6_info *rt, *rt_cache;
L
Linus Torvalds 已提交
1065 1066
	struct fib6_node *fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

1070
	rcu_read_lock();
1071
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
T
Thomas Graf 已提交
1072
restart:
1073 1074
	rt = rcu_dereference(fn->leaf);
	if (!rt) {
D
David Ahern 已提交
1075
		rt = net->ipv6.fib6_null_entry;
1076 1077 1078 1079
	} else {
		rt = rt6_device_match(net, rt, &fl6->saddr,
				      fl6->flowi6_oif, flags);
		if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
1080
			rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
D
David Ahern 已提交
1081
						  skb, flags);
1082
	}
D
David Ahern 已提交
1083
	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
	/* Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;

1093 1094 1095
	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

1096
	rcu_read_unlock();
D
David Ahern 已提交
1097

1098
	trace_fib6_table_lookup(net, rt, table, fl6);
D
David Ahern 已提交
1099

	return rt;

}

1104
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
D
David Ahern 已提交
1105
				   const struct sk_buff *skb, int flags)
F
Florian Westphal 已提交
1106
{
D
David Ahern 已提交
1107
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
F
Florian Westphal 已提交
1108 1109 1110
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

1111
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
D
David Ahern 已提交
1112 1113
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
T
Thomas Graf 已提交
1114
{
1115 1116 1117
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
T
Thomas Graf 已提交
1118 1119
	};
	struct dst_entry *dst;
1120
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
T
Thomas Graf 已提交
1121

1122
	if (saddr) {
1123
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1124 1125 1126
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

D
David Ahern 已提交
1127
	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

L
Linus Torvalds 已提交
1133 1134
	return NULL;
}
1135 1136
EXPORT_SYMBOL(rt6_lookup);

T
Thomas Graf 已提交
1137
/* ip6_ins_rt is called with FREE table->tb6_lock.
1138 1139 1140
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
L
Linus Torvalds 已提交
1141 1142
 */

1143
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
1144
			struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
1145 1146
{
	int err;
T
Thomas Graf 已提交
1147
	struct fib6_table *table;
L
Linus Torvalds 已提交
1148

T
Thomas Graf 已提交
1149
	table = rt->rt6i_table;
1150
	spin_lock_bh(&table->tb6_lock);
1151
	err = fib6_add(&table->tb6_root, rt, info, extack);
1152
	spin_unlock_bh(&table->tb6_lock);

	return err;
}

1157
int ip6_ins_rt(struct net *net, struct rt6_info *rt)
1158
{
1159
	struct nl_info info = {	.nl_net = net, };
1160

1161 1162
	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);
1163
	return __ip6_ins_rt(rt, &info, NULL);
1164 1165
}

1166 1167 1168
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
L
Linus Torvalds 已提交
1169
{
1170
	struct net_device *dev;
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

M
Martin KaFai Lau 已提交
1177
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1178
		ort = ort->from;
L
Linus Torvalds 已提交
1179

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->rt6i_metric = 0;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;
L
Linus Torvalds 已提交
1193

	if (!rt6_is_gw_or_nonexthop(ort)) {
		if (ort->rt6i_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
			rt->rt6i_flags |= RTF_ANYCAST;
L
Linus Torvalds 已提交
1198
#ifdef CONFIG_IPV6_SUBTREES
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
1202
		}
M
Martin KaFai Lau 已提交
1203
#endif
1204
	}
L
Linus Torvalds 已提交
1205

1206 1207
	return rt;
}
L
Linus Torvalds 已提交
1208

M
Martin KaFai Lau 已提交
1209 1210
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
1211
	struct net_device *dev;
M
Martin KaFai Lau 已提交
1212 1213
	struct rt6_info *pcpu_rt;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

1226
/* It should be called with rcu_read_lock() acquired */
M
Martin KaFai Lau 已提交
1227 1228
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
1229
	struct rt6_info *pcpu_rt, **p;

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

1234 1235
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);
1236

1237 1238 1239
	return pcpu_rt;
}

1240 1241
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
					    struct rt6_info *rt)
1242 1243
{
	struct rt6_info *pcpu_rt, *prev, **p;

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
1247 1248
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
M
Martin KaFai Lau 已提交
1249 1250
	}

1251 1252 1253
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
1254
	BUG_ON(prev);
1255

	return pcpu_rt;
}

/* exception hash table implementation
 */
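/* Each fib6 route can carry a bucket array of RTF_CACHE clones created
 * by PMTU updates and redirects.  Entries are hashed on the destination
 * address (and on the source address as well when the route lives in a
 * source-specific subtree), and are looked up on the fast path via
 * rt6_find_cached_rt().
 */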
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
1269
	struct net *net;

1271 1272
	if (!bucket || !rt6_ex)
		return;
1273 1274

	net = dev_net(rt6_ex->rt6i->dst.dev);
	rt6_ex->rt6i->rt6i_node = NULL;
	hlist_del_rcu(&rt6_ex->hlist);
	rt6_release(rt6_ex->rt6i);
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
	net->ipv6.rt6_stats->fib_rt_cache--;
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

static unsigned int fib6_mtu(const struct rt6_info *rt)
{
	unsigned int mtu;

	mtu = rt->fib6_pmtu ? : rt->rt6i_idev->cnf.mtu6;
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}
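
/* Attach an RTF_CACHE clone to its parent route's exception table.  The
 * insert is refused if the parent's bucket list has been flushed or if
 * the clone's MTU would not actually lower the path MTU; on success the
 * node's sernum is bumped so cached dsts get revalidated.
 */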

static int rt6_insert_exception(struct rt6_info *nrt,
				struct rt6_info *ort)
{
1398
	struct net *net = dev_net(nrt->dst.dev);
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	/* ort can't be a cache or pcpu route */
	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
1406
		ort = ort->from;
	WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (ort->rt6i_src.plen)
		src_key = &nrt->rt6i_src.addr;
#endif

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
	nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
1447
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
		err = -EINVAL;
		goto out;
	}
1451

	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	atomic_inc(&nrt->rt6i_ref);
	nrt->rt6i_node = ort->rt6i_node;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
	net->ipv6.rt6_stats->fib_rt_cache++;

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
1477
	if (!err) {
1478
		spin_lock_bh(&ort->rt6i_table->tb6_lock);
1479
		fib6_update_sernum(net, ort);
1480
		spin_unlock_bh(&ort->rt6i_table->tb6_lock);
		fib6_force_start_gc(net);
	}

	return err;
}

void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
1550
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err;

	if (!from ||
1556
	    !(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
1595
	struct rt6_info *from = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
1600
	    !(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (from->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
				       struct rt6_info *rt, int mtu)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1684
			 * route), the metrics of its rt->from have already
1685 1686
			 * been updated.
			 */
1687
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1688
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1689
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1690
		}
1691
		bucket++;
	}
}

#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
		rt6_remove_exception(bucket, rt6_ex);
		return;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
1763
			neigh_flags = neigh->flags;
1764

		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}

	gc_args->more++;
}

void rt6_age_exceptions(struct rt6_info *rt,
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
}
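
/* Main policy-routing lookup.  The fib6 tree is walked under RCU,
 * backtracking towards less specific prefixes and, if nothing reachable
 * is found, retrying without RT6_LOOKUP_F_REACHABLE.  The result is
 * either the null entry, a cached exception, an uncached clone (for
 * FLOWI_FLAG_KNOWN_NH), or a per-cpu copy of the matched route.
 */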

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
D
David Ahern 已提交
1808 1809
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
L
Linus Torvalds 已提交
1810
{
1811
	struct fib6_node *fn, *saved_fn;
1812
	struct rt6_info *rt, *rt_cache;
T
Thomas Graf 已提交
1813
	int strict = 0;
L
Linus Torvalds 已提交
1814

1815
	strict |= flags & RT6_LOOKUP_F_IFACE;
1816
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1817 1818
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;
L
Linus Torvalds 已提交
1819

1820
	rcu_read_lock();
L
Linus Torvalds 已提交
1821

1822
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1823
	saved_fn = fn;
L
Linus Torvalds 已提交
1824

D
David Ahern 已提交
1825 1826 1827
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

M
Martin KaFai Lau 已提交
1828
redo_rt6_select:
W
Wei Wang 已提交
1829
	rt = rt6_select(net, fn, oif, strict);
1830
	if (rt->rt6i_nsiblings)
1831
		rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
D
David Ahern 已提交
1832
	if (rt == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
1833 1834 1835
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
1836 1837 1838 1839 1840 1841
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
M
Martin KaFai Lau 已提交
1842 1843
	}

1844 1845 1846 1847
	/*Search through exception table */
	rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (rt_cache)
		rt = rt_cache;
1848

D
David Ahern 已提交
1849 1850
	if (rt == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
1851
		rcu_read_unlock();
1852
		dst_hold(&rt->dst);
1853
		trace_fib6_table_lookup(net, rt, table, fl6);
1854 1855
		return rt;
	} else if (rt->rt6i_flags & RTF_CACHE) {
1856
		if (ip6_hold_safe(net, &rt, true))
1857
			dst_use_noref(&rt->dst, jiffies);
1858

1859
		rcu_read_unlock();
1860
		trace_fib6_table_lookup(net, rt, table, fl6);
M
Martin KaFai Lau 已提交
1861
		return rt;
1862 1863 1864 1865 1866 1867 1868 1869 1870 1871
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(rt->rt6i_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */

		struct rt6_info *uncached_rt;

1872 1873 1874
		if (ip6_hold_safe(net, &rt, true)) {
			dst_use_noref(&rt->dst, jiffies);
		} else {
1875
			rcu_read_unlock();
1876 1877 1878
			uncached_rt = rt;
			goto uncached_rt_out;
		}
1879
		rcu_read_unlock();
M
Martin KaFai Lau 已提交
1880

1881 1882
		uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
		dst_release(&rt->dst);
T
Thomas Graf 已提交
1883

1884 1885 1886 1887
		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
1888
			rt6_uncached_list_add(uncached_rt);
W
Wei Wang 已提交
1889
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1890
		} else {
1891
			uncached_rt = net->ipv6.ip6_null_entry;
1892 1893
			dst_hold(&uncached_rt->dst);
		}
D
David Ahern 已提交
1894

1895
uncached_rt_out:
1896
		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
1897 1898
		return uncached_rt;

M
Martin KaFai Lau 已提交
1899 1900 1901 1902 1903
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

1904
		dst_use_noref(&rt->dst, jiffies);
1905
		local_bh_disable();
M
Martin KaFai Lau 已提交
1906 1907
		pcpu_rt = rt6_get_pcpu_route(rt);

1908
		if (!pcpu_rt) {
1909 1910
			/* atomic_inc_not_zero() is needed when using rcu */
			if (atomic_inc_not_zero(&rt->rt6i_ref)) {
1911
				/* No dst_hold() on rt is needed because grabbing
1912 1913
				 * rt->rt6i_ref makes sure rt can't be released.
				 */
1914
				pcpu_rt = rt6_make_pcpu_route(net, rt);
1915 1916 1917 1918 1919 1920
				rt6_release(rt);
			} else {
				/* rt is already removed from tree */
				pcpu_rt = net->ipv6.ip6_null_entry;
				dst_hold(&pcpu_rt->dst);
			}
1921
		}
1922 1923
		local_bh_enable();
		rcu_read_unlock();
1924
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
M
Martin KaFai Lau 已提交
1925 1926
		return pcpu_rt;
	}
L
Linus Torvalds 已提交
1927
}
1928
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

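/* For ICMPv6 errors, hash on the addresses of the offending (inner)
 * packet instead of the error itself, so the error follows the same
 * multipath nexthop as the flow that triggered it.  Pre-dissected flow
 * keys can only be reused for non-error traffic.
 */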
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = icmp6_hdr(skb);
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
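/* Hash policy 0 uses the L3 triple (addresses, flow label, protocol);
 * policy 1 uses the L4 five-tuple when ports can be dissected and reuses
 * an skb->l4_hash that was already computed for the packet.
 */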
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

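/* Entry point for received packets: build a flowi6 from the packet,
 * carry any tunnel key, and let ICMPv6 errors compute a multipath hash
 * from the embedded packet before the FIB lookup sets skb's dst.
 */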
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
}

struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool any_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;

	any_src = ipv6_addr_any(&fl6->saddr);
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
	    (fl6->flowi6_oif && any_src))
		flags |= RT6_LOOKUP_F_IFACE;

	if (!any_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);

struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *new = NULL;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (rt) {
		rt6_info_init(rt);
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

		new = &rt->dst;
		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		dst_copy_metrics(new, &ort->dst);

		rt->rt6i_idev = in6_dev_get(loopback_dev);
		rt->rt6i_gateway = ort->rt6i_gateway;
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
		rt->rt6i_metric = 0;

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
}

/*
 *	Destination cache support functions
 */

static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    rt6_check(rt->from, cookie))
		return &rt->dst;
	else
		return NULL;
}

static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	if (rt->rt6i_flags & RTF_PCPU ||
	    (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
		return rt6_dst_from_check(rt, cookie);
	else
		return rt6_check(rt, cookie);
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (rt6_check_expired(rt)) {
				ip6_del_rt(dev_net(dst->dev), rt);
				dst = NULL;
			}
		} else {
			dst_release(dst);
			dst = NULL;
		}
	}
	return dst;
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (rt) {
		if (rt->rt6i_flags & RTF_CACHE) {
			if (dst_hold_safe(&rt->dst))
				ip6_del_rt(dev_net(rt->dst.dev), rt);
		} else {
			struct fib6_node *fn;

			rcu_read_lock();
			fn = rcu_dereference(rt->rt6i_node);
			if (fn && (rt->rt6i_flags & RTF_DEFAULT))
				fn->fn_sernum = -1;
			rcu_read_unlock();
		}
	}
}

static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
	return !(rt->rt6i_flags & RTF_CACHE) &&
		(rt->rt6i_flags & RTF_PCPU ||
		 rcu_access_pointer(rt->rt6i_node));
}

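/* Record a reduced path MTU.  An already-cached clone is updated in
 * place (and its exception timestamp refreshed); otherwise a new
 * RTF_CACHE exception is allocated from the parent FIB entry so the
 * learned MTU is kept per destination, in the spirit of RFC 8201.
 */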
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		struct rt6_info *nrt6;

		nrt6 = ip6_rt_cache_alloc(rt6->from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, rt6->from))
				dst_release_immediate(&nrt6->dst);
		}
	}
}

static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

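/* ip6rd_flowi extends the flow with the gateway a redirect came from so
 * that __ip6_route_redirect() can check, per RFC 4861, that the sender
 * really is the current nexthop router for the destination.
 */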
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
	struct rt6_info *rt, *rt_cache;
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from appropriate router.
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
			continue;
		if (fib6_check_expired(rt))
			continue;
		if (rt->rt6i_flags & RTF_REJECT)
			break;
		if (!(rt->rt6i_flags & RTF_GATEWAY))
			continue;
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
			continue;
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
				rt = rt_cache;
				break;
			}
			continue;
		}
		break;
	}

	if (!rt)
		rt = net->ipv6.fib6_null_entry;
	else if (rt->rt6i_flags & RTF_REJECT) {
		rt = net->ipv6.ip6_null_entry;
		goto out;
	}

	if (rt == net->ipv6.fib6_null_entry) {
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}

out:
	ip6_hold_safe(net, &rt, true);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);
	return rt;
};

static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				flags, __ip6_route_redirect);
}

void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	struct inet6_dev *idev;
	unsigned int mtu;

	mtu = dst_metric_raw(dst, RTAX_MTU);
	if (mtu)
		goto out;

	mtu = IPV6_MIN_MTU;

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

out:
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		in6_dev_put(idev);
		dst = ERR_PTR(-ENOMEM);
		goto out;
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output  = ip6_output;
	rt->rt6i_gateway  = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

out:
	return dst;
}

static int ip6_dst_gc(struct dst_ops *ops)
{
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
	int entries;

	entries = dst_entries_get_fast(ops);
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
	    entries <= rt_max_size)
		goto out;

	net->ipv6.ip6_rt_gc_expire++;
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
	return entries > rt_max_size;
}

static int ip6_convert_metrics(struct net *net, struct rt6_info *rt,
			       struct fib6_config *cfg)
{
	int err = 0;

	if (cfg->fc_mx) {
		rt->fib6_metrics = kzalloc(sizeof(*rt->fib6_metrics),
					   GFP_KERNEL);
		if (unlikely(!rt->fib6_metrics))
			return -ENOMEM;

		refcount_set(&rt->fib6_metrics->refcnt, 1);

		err = ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len,
					 rt->fib6_metrics->metrics);
	}

	return err;
}

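/* The helpers below resolve and validate the gateway of a route that is
 * being added: a table-scoped lookup of the nexthop, the RTNH_F_ONLINK
 * sanity check, and the generic nexthop check used when no specific
 * table is forced by the caller.
 */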
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
	const struct net_device *dev = *_dev;
	bool need_addr_check = !dev;
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

	err = 0;
out:
	return err;
}

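/* Translate a fib6_config (from netlink, ioctl or RA processing) into a
 * new rt6_info: validate flags, prefix lengths, device and gateway,
 * attach metrics and lwtunnel state, and pick the FIB table.  The route
 * is returned unlinked; the caller inserts it into the tree.
 */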
static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
					      struct netlink_ext_ack *extack)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	struct rt6_info *rt = NULL;
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
	struct fib6_table *table;
	int addr_type;
	int err = -EINVAL;

	/* RTF_PCPU is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
		goto out;
	}

	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
		goto out;
	}
#ifndef CONFIG_IPV6_SUBTREES
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
		goto out;
	}
#endif
	if (cfg->fc_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(net, cfg->fc_ifindex);
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;

	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

	err = -ENOBUFS;
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
		table = fib6_get_table(net, cfg->fc_table);
		if (!table) {
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}

	if (!table)
		goto out;

	rt = ip6_dst_alloc(net, NULL,
			   (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);

	if (!rt) {
		err = -ENOMEM;
		goto out;
	}

	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

	if (cfg->fc_flags & RTF_EXPIRES)
		fib6_set_expires(rt, jiffies +
				clock_t_to_jiffies(cfg->fc_expires));
	else
		fib6_clean_expires(rt);

	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
	rt->rt6i_protocol = cfg->fc_protocol;

	addr_type = ipv6_addr_type(&cfg->fc_dst);

	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

		err = lwtunnel_build_state(cfg->fc_encap_type,
					   cfg->fc_encap, AF_INET6, cfg,
					   &lwtstate, extack);
		if (err)
			goto out;
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
	}

	ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->rt6i_dst.plen = cfg->fc_dst_len;
	if (rt->rt6i_dst.plen == 128)
		rt->dst.flags |= DST_HOST;

#ifdef CONFIG_IPV6_SUBTREES
	ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->rt6i_src.plen = cfg->fc_src_len;
#endif

	rt->rt6i_metric = cfg->fc_metric;
	rt->fib6_nh.nh_weight = 1;

	rt->fib6_type = cfg->fc_type;

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
	if ((cfg->fc_flags & RTF_REJECT) ||
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
		/* hold loopback dev/idev if we haven't done so. */
		if (dev != net->loopback_dev) {
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
			dev = net->loopback_dev;
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
		rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
		goto install_route;
	}

	if (cfg->fc_flags & RTF_GATEWAY) {
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
			goto out;

		rt->fib6_nh.nh_gw = rt->rt6i_gateway = cfg->fc_gateway;
	}

	err = -ENODEV;
	if (!dev)
		goto out;

	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
			NL_SET_ERR_MSG(extack, "Invalid source address");
			err = -EINVAL;
			goto out;
		}
		rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
		rt->rt6i_prefsrc.plen = 128;
	} else
		rt->rt6i_prefsrc.plen = 0;

	rt->rt6i_flags = cfg->fc_flags;

install_route:
	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
	    !netif_carrier_ok(dev))
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
	rt->fib6_nh.nh_dev = rt->dst.dev = dev;
	rt->rt6i_idev = idev;
	rt->rt6i_table = table;

	cfg->fc_nlinfo.nl_net = dev_net(dev);

	return rt;
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);
	if (rt)
		dst_release_immediate(&rt->dst);

	return ERR_PTR(err);
}

int ip6_route_add(struct fib6_config *cfg, struct netlink_ext_ack *extack)
{
	struct rt6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);

	return err;
}

static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
{
	struct net *net = info->nl_net;
	struct fib6_table *table;
	int err;

	if (rt == net->ipv6.fib6_null_entry) {
		err = -ENOENT;
		goto out;
	}

	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);
	err = fib6_del(rt, info);
	spin_unlock_bh(&table->tb6_lock);

out:
	ip6_rt_put(rt);
	return err;
}

int ip6_del_rt(struct net *net, struct rt6_info *rt)
{
	struct nl_info info = { .nl_net = net };

	return __ip6_del_rt(rt, &info);
}

static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
{
	struct nl_info *info = &cfg->fc_nlinfo;
	struct net *net = info->nl_net;
	struct sk_buff *skb = NULL;
	struct fib6_table *table;
	int err = -ENOENT;

	if (rt == net->ipv6.fib6_null_entry)
		goto out_put;
	table = rt->rt6i_table;
	spin_lock_bh(&table->tb6_lock);

	if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
		struct rt6_info *sibling, *next_sibling;

		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

			if (rt6_fill_node(net, skb, rt, NULL,
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings,
					 rt6i_siblings) {
			err = fib6_del(sibling, info);
			if (err)
				goto out_unlock;
		}
	}

	err = fib6_del(rt, info);
out_unlock:
	spin_unlock_bh(&table->tb6_lock);
out_put:
	ip6_rt_put(rt);

	if (skb) {
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
			    info->nlh, gfp_any());
	}
	return err;
}

static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
{
	struct rt6_info *rt, *rt_cache;
	struct fib6_table *table;
	struct fib6_node *fn;
	int err = -ESRCH;

	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
		return err;
	}

	rcu_read_lock();

	fn = fib6_locate(&table->tb6_root,
			 &cfg->fc_dst, cfg->fc_dst_len,
			 &cfg->fc_src, cfg->fc_src_len,
			 !(cfg->fc_flags & RTF_CACHE));

	if (fn) {
		for_each_fib6_node_rt_rcu(fn) {
			if (cfg->fc_flags & RTF_CACHE) {
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
				if (!rt_cache)
					continue;
				rt = rt_cache;
			}
			if (cfg->fc_ifindex &&
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
				continue;
			if (cfg->fc_flags & RTF_GATEWAY &&
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
				continue;
			if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
				continue;
			if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
				continue;
			if (!dst_hold_safe(&rt->dst))
				break;
			rcu_read_unlock();

			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
		}
	}
	rcu_read_unlock();

	return err;
}

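/* Process an ICMPv6 redirect: validate the message and its ND options,
 * update the neighbour cache entry for the new first hop, then install
 * an RTF_CACHE exception clone whose gateway points at the redirect
 * target.
 */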
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct netevent_redirect netevent;
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
	struct rd_msg *msg;
	int optlen, on_link;
	u8 *lladdr;

	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
	optlen -= sizeof(*msg);

	if (optlen < 0) {
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
		return;
	}

	msg = (struct rd_msg *)icmp6_hdr(skb);

	if (ipv6_addr_is_multicast(&msg->dest)) {
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
		return;
	}

	on_link = 0;
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
		on_link = 1;
	} else if (ipv6_addr_type(&msg->target) !=
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}

	lladdr = NULL;
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

	rt = (struct rt6_info *) dst;
	if (rt->rt6i_flags & RTF_REJECT) {
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
		return;
	}

	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);

	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
	if (!neigh)
		return;

	/*
	 *	We have finally decided to accept it.
	 */

	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);

	nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
	if (!nrt)
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

	nrt->rt6i_protocol = RTPROT_REDIRECT;
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;

	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
	if (rt6_insert_exception(nrt, rt->from)) {
		dst_release_immediate(&nrt->dst);
		goto out;
	}

	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
	netevent.daddr = &msg->dest;
	netevent.neigh = neigh;
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

out:
	neigh_release(neigh);
}

#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_get_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
	struct fib6_node *fn;
	struct rt6_info *rt = NULL;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
	if (!fn)
		goto out;

	for_each_fib6_node_rt_rcu(fn) {
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
			continue;
		if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
			continue;
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
			continue;
		ip6_hold_safe(NULL, &rt, false);
		break;
	}
out:
	rcu_read_unlock();
	return rt;
}

static struct rt6_info *rt6_add_route_info(struct net *net,
					   const struct in6_addr *prefix, int prefixlen,
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
					   unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;

	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
		cfg.fc_flags |= RTF_DEFAULT;

	ip6_route_add(&cfg, NULL);

	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
}
#endif

struct rt6_info *rt6_get_dflt_router(struct net *net,
				     const struct in6_addr *addr,
				     struct net_device *dev)
{
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
	struct rt6_info *rt;
	struct fib6_table *table;

	table = fib6_get_table(net, tb_id);
	if (!table)
		return NULL;

	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (dev == rt->fib6_nh.nh_dev &&
		    ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
			break;
	}
	if (rt)
		ip6_hold_safe(NULL, &rt, false);
	rcu_read_unlock();
	return rt;
}

struct rt6_info *rt6_add_dflt_router(struct net *net,
				     const struct in6_addr *gwaddr,
				     struct net_device *dev,
				     unsigned int pref)
{
	struct fib6_config cfg = {
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
		.fc_metric	= IP6_RT_PRIO_USER,
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
		.fc_protocol = RTPROT_RA,
		.fc_type = RTN_UNICAST,
		.fc_nlinfo.portid = 0,
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
	};

	cfg.fc_gateway = *gwaddr;

	if (!ip6_route_add(&cfg, NULL)) {
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}

	return rt6_get_dflt_router(net, gwaddr, dev);
}

static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
{
	struct rt6_info *rt;

restart:
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
		if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
		    (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
			if (dst_hold_safe(&rt->dst)) {
				rcu_read_unlock();
				ip6_del_rt(net, rt);
			} else {
				rcu_read_unlock();
			}
			goto restart;
		}
	}
	rcu_read_unlock();

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
	cfg->fc_type = rtmsg->rtmsg_type;

	cfg->fc_nlinfo.nl_net = net;

	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
}

int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

static int ip6_pkt_discard(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct rt6_info *addrconf_dst_alloc(struct net *net,
				    struct inet6_dev *idev,
				    const struct in6_addr *addr,
				    bool anycast)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct rt6_info *rt;

	rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
	if (!rt)
		return ERR_PTR(-ENOMEM);

	in6_dev_hold(idev);
	rt->rt6i_idev = idev;

	rt->dst.flags |= DST_HOST;
	rt->rt6i_protocol = RTPROT_KERNEL;
	rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		rt->fib6_type = RTN_ANYCAST;
		rt->rt6i_flags |= RTF_ANYCAST;
	} else {
		rt->fib6_type = RTN_LOCAL;
		rt->rt6i_flags |= RTF_LOCAL;
	}

	rt->fib6_nh.nh_gw = *addr;
	rt->fib6_nh.nh_dev = dev;
	rt->rt6i_gateway  = *addr;
	rt->rt6i_dst.addr = *addr;
	rt->rt6i_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	rt->rt6i_table = fib6_get_table(net, tb_id);

	return rt;
}

3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->rt6i_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
{
	struct rt6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->rt6i_node,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->rt6i_table->tb6_lock));
	while (iter) {
		if (iter->rt6i_metric == rt->rt6i_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->rt6_next,
				lockdep_is_held(&rt->rt6i_table->tb6_lock));
	}

	return NULL;
}

static bool rt6_is_dead(const struct rt6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
		return true;

	return false;
}

static int rt6_multipath_total_weight(const struct rt6_info *rt)
{
	struct rt6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}

static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
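	/* Publish the cumulative weight as this nexthop's upper bound in
	 * the 31-bit multipath hash space; dead nexthops keep -1 so they
	 * are never selected.
	 */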
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
{
	struct rt6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct rt6_info *rt)
{
	struct rt6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->rt6i_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

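/* A device came back up: clear the given nexthop flags on every route
 * using it, bump the sernum so cached dsts are revalidated, and
 * recompute the multipath weights.
 */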
static int fib6_ifup(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
				   const struct net_device *dev)
{
	struct rt6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct rt6_info *rt)
{
	struct rt6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
					     const struct net_device *down_dev)
{
	struct rt6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct rt6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->rt6i_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->rt6i_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
};

static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

struct rt6_nh {
	struct rt6_info *rt6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
		        nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct rt6_info *rt, struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if rt6_info already exists */
		if (rt6_duplicate_nexthop(nh->rt6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->rt6_info = rt;
	err = ip6_convert_metrics(net, rt, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct rt6_info *rt,
				   struct rt6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
		rt = list_first_entry(&rt_last->rt6i_siblings,
				      struct rt6_info,
				      rt6i_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

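/* Add a multipath route: build one rt6_info per RTA_MULTIPATH nexthop,
 * insert them as siblings and send a single notification for the whole
 * route; if an insert fails, roll back the nexthops already added.
 */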
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct rt6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct rt6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * rt6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			dst_release_immediate(&rt->dst);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->rt6_info;
		err = __ip6_ins_rt(nh->rt6_info, info, extack);
		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->rt6_info;

		/* nh->rt6_info is used or freed at this point, reset to NULL*/
		nh->rt6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->rt6_info)
			dst_release_immediate(&nh->rt6_info->dst);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

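/* Delete one route per nexthop listed in RTA_MULTIPATH; remember the
 * last error but keep processing the remaining entries.
 */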
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
4335 4336 4337 4338 4339 4340 4341 4342 4343 4344 4345 4346 4347 4348 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, extack);
}

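/* Worst-case netlink message size for this route: the base rtmsg and
 * its attributes, plus one RTA_MULTIPATH nexthop entry per sibling.
 */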
static size_t rt6_nlmsg_size(struct rt6_info *rt)
4406
{
4407 4408 4409 4410 4411 4412
	int nexthop_len = 0;

	if (rt->rt6i_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
4413
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4414 4415 4416 4417

		nexthop_len *= rt->rt6i_nsiblings;
	}

4418 4419 4420 4421 4422 4423 4424 4425 4426
	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
4427
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4428
	       + nla_total_size(sizeof(struct rta_cacheinfo))
4429
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4430
	       + nla_total_size(1) /* RTA_PREF */
4431
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4432 4433 4434 4435
	       + nexthop_len;
}

static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
4436
			    unsigned int *flags, bool skip_oif)
4437
{
4438
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4439 4440
		*flags |= RTNH_F_DEAD;

4441
	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4442 4443 4444 4445 4446 4447
		*flags |= RTNH_F_LINKDOWN;
		if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
			*flags |= RTNH_F_DEAD;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
4448
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4449 4450 4451
			goto nla_put_failure;
	}

4452 4453
	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4454 4455
		*flags |= RTNH_F_OFFLOAD;

4456
	/* not needed for multipath encoding b/c it has a rtnexthop struct */
4457 4458
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4459 4460
		goto nla_put_failure;

4461 4462
	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4463 4464 4465 4466 4467 4468 4469 4470
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4471
/* add multipath next hop */
4472 4473
static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
{
4474
	const struct net_device *dev = rt->fib6_nh.nh_dev;
4475 4476 4477 4478 4479 4480 4481
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

4482 4483
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4484

4485
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4486 4487 4488 4489 4490 4491 4492 4493 4494 4495 4496
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
4497 4498
}

4499 4500 4501
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct rt6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
4502
			 int iif, int type, u32 portid, u32 seq,
4503
			 unsigned int flags)
L
Linus Torvalds 已提交
4504 4505
{
	struct rtmsg *rtm;
4506
	struct nlmsghdr *nlh;
4507 4508
	long expires = 0;
	u32 *pmetrics;
4509
	u32 table;
L
Linus Torvalds 已提交
4510

4511
	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4512
	if (!nlh)
4513
		return -EMSGSIZE;
4514 4515

	rtm = nlmsg_data(nlh);
L
Linus Torvalds 已提交
4516 4517 4518 4519
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->rt6i_dst.plen;
	rtm->rtm_src_len = rt->rt6i_src.plen;
	rtm->rtm_tos = 0;
T
Thomas Graf 已提交
4520
	if (rt->rt6i_table)
4521
		table = rt->rt6i_table->tb6_id;
T
Thomas Graf 已提交
4522
	else
4523 4524
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
D
David S. Miller 已提交
4525 4526
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;
4527 4528

	rtm->rtm_type = rt->fib6_type;
L
Linus Torvalds 已提交
4529 4530 4531 4532
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->rt6i_protocol;

4533
	if (rt->rt6i_flags & RTF_CACHE)
L
Linus Torvalds 已提交
4534 4535
		rtm->rtm_flags |= RTM_F_CLONED;

4536 4537
	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
D
David S. Miller 已提交
4538
			goto nla_put_failure;
4539
		rtm->rtm_dst_len = 128;
L
Linus Torvalds 已提交
4540
	} else if (rtm->rtm_dst_len)
4541
		if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
D
David S. Miller 已提交
4542
			goto nla_put_failure;
L
Linus Torvalds 已提交
4543 4544
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
4545
		if (nla_put_in6_addr(skb, RTA_SRC, src))
D
David S. Miller 已提交
4546
			goto nla_put_failure;
4547
		rtm->rtm_src_len = 128;
D
David S. Miller 已提交
4548
	} else if (rtm->rtm_src_len &&
4549
		   nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
D
David S. Miller 已提交
4550
		goto nla_put_failure;
L
Linus Torvalds 已提交
4551
#endif
4552 4553 4554
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
4555 4556 4557 4558 4559 4560
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
4561 4562
		} else
#endif
D
David S. Miller 已提交
4563 4564
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
4565
	} else if (dest) {
L
Linus Torvalds 已提交
4566
		struct in6_addr saddr_buf;
4567
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4568
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
D
David S. Miller 已提交
4569
			goto nla_put_failure;
L
Linus Torvalds 已提交
4570
	}
4571

4572 4573
	if (rt->rt6i_prefsrc.plen) {
		struct in6_addr saddr_buf;
A
Alexey Dobriyan 已提交
4574
		saddr_buf = rt->rt6i_prefsrc.addr;
4575
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
D
David S. Miller 已提交
4576
			goto nla_put_failure;
4577 4578
	}

4579 4580
	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4581 4582
		goto nla_put_failure;

D
David S. Miller 已提交
4583 4584
	if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
		goto nla_put_failure;
4585

4586 4587 4588 4589 4590 4591 4592 4593 4594 4595 4596 4597 4598 4599 4600 4601 4602 4603 4604 4605 4606 4607
	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->rt6i_nsiblings) {
		struct rt6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->rt6i_siblings, rt6i_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
4608
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4609 4610 4611
			goto nla_put_failure;
	}

4612 4613 4614 4615
	if (rt->rt6i_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}
4616

4617
	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4618
		goto nla_put_failure;
4619

4620 4621 4622
	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
		goto nla_put_failure;

4623

4624 4625
	nlmsg_end(skb, nlh);
	return 0;
4626 4627

nla_put_failure:
4628 4629
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
L
Linus Torvalds 已提交
4630 4631
}

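/* Per-route callback for RTM_GETROUTE dumps: skip the null entry and,
 * when the request sets RTM_F_PREFIX, emit only prefix routes.
 */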
int rt6_dump_route(struct rt6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
4633 4634
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4635 4636
	struct net *net = arg->net;

D
David Ahern 已提交
4637
	if (rt == net->ipv6.fib6_null_entry)
4638
		return 0;
L
Linus Torvalds 已提交
4639

4640 4641
	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4642 4643 4644 4645 4646 4647 4648 4649

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->rt6i_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}
L
Linus Torvalds 已提交
4650

4651 4652 4653
	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
L
Linus Torvalds 已提交
4654 4655
}

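/* RTM_GETROUTE doit handler: build a flow from the request attributes,
 * do an input (iif) or output (oif) lookup and answer with a single
 * RTM_NEWROUTE; RTM_F_FIB_MATCH reports the matching FIB entry instead
 * of the dst produced by the lookup.
 */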
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
4658
{
4659
	struct net *net = sock_net(in_skb->sk);
4660
	struct nlattr *tb[RTA_MAX+1];
4661 4662
	int err, iif = 0, oif = 0;
	struct dst_entry *dst;
4663
	struct rt6_info *rt;
L
Linus Torvalds 已提交
4664
	struct sk_buff *skb;
4665
	struct rtmsg *rtm;
4666
	struct flowi6 fl6;
4667
	bool fibmatch;
L
Linus Torvalds 已提交
4668

4669
	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4670
			  extack);
4671 4672
	if (err < 0)
		goto errout;
L
Linus Torvalds 已提交
4673

4674
	err = -EINVAL;
4675
	memset(&fl6, 0, sizeof(fl6));
4676 4677
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4678
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
L
Linus Torvalds 已提交
4679

4680 4681 4682 4683
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
4684
		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4685 4686 4687 4688 4689 4690
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

A
Alexey Dobriyan 已提交
4691
		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4692 4693 4694 4695 4696 4697
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
4698
		oif = nla_get_u32(tb[RTA_OIF]);
L
Linus Torvalds 已提交
4699

4700 4701 4702
	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

4703 4704 4705 4706 4707 4708
	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

L
Linus Torvalds 已提交
4709 4710
	if (iif) {
		struct net_device *dev;
4711 4712
		int flags = 0;

4713 4714 4715
		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
L
Linus Torvalds 已提交
4716
		if (!dev) {
4717
			rcu_read_unlock();
L
Linus Torvalds 已提交
4718
			err = -ENODEV;
4719
			goto errout;
L
Linus Torvalds 已提交
4720
		}
4721 4722 4723 4724 4725 4726

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

D
David Ahern 已提交
4727
		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
4728 4729

		rcu_read_unlock();
4730 4731 4732
	} else {
		fl6.flowi6_oif = oif;

4733
		dst = ip6_route_output(net, NULL, &fl6);
4734 4735 4736 4737 4738 4739 4740 4741
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
L
Linus Torvalds 已提交
4742 4743
	}

4744 4745 4746 4747 4748 4749
	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

4750 4751
	if (fibmatch && rt->from) {
		struct rt6_info *ort = rt->from;
4752 4753 4754 4755 4756 4757

		dst_hold(&ort->dst);
		ip6_rt_put(rt);
		rt = ort;
	}

4758
	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
4759
	if (!skb) {
A
Amerigo Wang 已提交
4760
		ip6_rt_put(rt);
4761 4762 4763
		err = -ENOBUFS;
		goto errout;
	}
L
Linus Torvalds 已提交
4764

4765
	skb_dst_set(skb, &rt->dst);
4766
	if (fibmatch)
4767
		err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, iif,
4768 4769 4770
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
4771 4772 4773 4774
		err = rt6_fill_node(net, skb, rt, dst, &fl6.daddr, &fl6.saddr,
				    iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
L
Linus Torvalds 已提交
4775
	if (err < 0) {
4776 4777
		kfree_skb(skb);
		goto errout;
L
Linus Torvalds 已提交
4778 4779
	}

4780
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4781
errout:
L
Linus Torvalds 已提交
4782 4783 4784
	return err;
}

4785 4786
void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
L
Linus Torvalds 已提交
4787 4788
{
	struct sk_buff *skb;
4789
	struct net *net = info->nl_net;
4790 4791 4792 4793
	u32 seq;
	int err;

	err = -ENOBUFS;
4794
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;
4795

4796
	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
4797
	if (!skb)
4798 4799
		goto errout;

4800 4801
	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
4802 4803 4804 4805 4806 4807
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
4808
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
4809 4810
		    info->nlh, gfp_any());
	return;
4811 4812
errout:
	if (err < 0)
4813
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
L
Linus Torvalds 已提交
4814 4815
}

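/* Keep the per-netns special routes (null, prohibit, blackhole) bound
 * to the loopback device as it is registered and unregistered.
 */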
static int ip6_route_dev_notify(struct notifier_block *this,
4817
				unsigned long event, void *ptr)
4818
{
4819
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
4820
	struct net *net = dev_net(dev);
4821

4822 4823 4824 4825
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
D
David Ahern 已提交
4826 4827
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(dev);
4828
		net->ipv6.ip6_null_entry->dst.dev = dev;
4829 4830
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4831
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
4832
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
4833
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
4834
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
4835
#endif
4836 4837 4838 4839 4840
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
D
David Ahern 已提交
4841
		in6_dev_put_clear(&net->ipv6.fib6_null_entry->rt6i_idev);
4842
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
4843
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
4844 4845
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
4846 4847 4848 4849 4850 4851
#endif
	}

	return NOTIFY_OK;
}

L
Linus Torvalds 已提交
4852 4853 4854 4855 4856 4857
/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

4858 4859 4860 4861
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
4862
	.release	= seq_release_net,
4863 4864
};

L
Linus Torvalds 已提交
4865 4866
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
4867
	struct net *net = (struct net *)seq->private;
L
Linus Torvalds 已提交
4868
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
4869 4870
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
W
Wei Wang 已提交
4871
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
4872 4873
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
4874
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
4875
		   net->ipv6.rt6_stats->fib_discarded_routes);
L
Linus Torvalds 已提交
4876 4877 4878 4879 4880 4881

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
4882
	return single_open_net(inode, file, rt6_stats_seq_show);
4883 4884
}

4885
static const struct file_operations rt6_stats_seq_fops = {
L
Linus Torvalds 已提交
4886 4887 4888
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
4889
	.release = single_release_net,
L
Linus Torvalds 已提交
4890 4891 4892 4893 4894 4895
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL
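/* The "flush" entry below is write-only: writing to it triggers an
 * immediate fib6 garbage-collection run for this netns.
 */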

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
L
Linus Torvalds 已提交
4897 4898
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
4899 4900 4901
	struct net *net;
	int delay;
	if (!write)
L
Linus Torvalds 已提交
4902
		return -EINVAL;
4903 4904 4905 4906

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
4907
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
4908
	return 0;
L
Linus Torvalds 已提交
4909 4910
}

4911
struct ctl_table ipv6_route_table_template[] = {
4912
	{
L
Linus Torvalds 已提交
4913
		.procname	=	"flush",
4914
		.data		=	&init_net.ipv6.sysctl.flush_delay,
L
Linus Torvalds 已提交
4915
		.maxlen		=	sizeof(int),
4916
		.mode		=	0200,
A
Alexey Dobriyan 已提交
4917
		.proc_handler	=	ipv6_sysctl_rtcache_flush
L
Linus Torvalds 已提交
4918 4919 4920
	},
	{
		.procname	=	"gc_thresh",
4921
		.data		=	&ip6_dst_ops_template.gc_thresh,
L
Linus Torvalds 已提交
4922 4923
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4924
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
4925 4926 4927
	},
	{
		.procname	=	"max_size",
4928
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
L
Linus Torvalds 已提交
4929 4930
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4931
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
4932 4933 4934
	},
	{
		.procname	=	"gc_min_interval",
4935
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
4936 4937
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4938
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
4939 4940 4941
	},
	{
		.procname	=	"gc_timeout",
4942
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
L
Linus Torvalds 已提交
4943 4944
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4945
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
4946 4947 4948
	},
	{
		.procname	=	"gc_interval",
4949
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
L
Linus Torvalds 已提交
4950 4951
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4952
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
4953 4954 4955
	},
	{
		.procname	=	"gc_elasticity",
4956
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
L
Linus Torvalds 已提交
4957 4958
		.maxlen		=	sizeof(int),
		.mode		=	0644,
4959
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
4960 4961 4962
	},
	{
		.procname	=	"mtu_expires",
4963
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
L
Linus Torvalds 已提交
4964 4965
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4966
		.proc_handler	=	proc_dointvec_jiffies,
L
Linus Torvalds 已提交
4967 4968 4969
	},
	{
		.procname	=	"min_adv_mss",
4970
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
L
Linus Torvalds 已提交
4971 4972
		.maxlen		=	sizeof(int),
		.mode		=	0644,
4973
		.proc_handler	=	proc_dointvec,
L
Linus Torvalds 已提交
4974 4975 4976
	},
	{
		.procname	=	"gc_min_interval_ms",
4977
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
L
Linus Torvalds 已提交
4978 4979
		.maxlen		=	sizeof(int),
		.mode		=	0644,
A
Alexey Dobriyan 已提交
4980
		.proc_handler	=	proc_dointvec_ms_jiffies,
L
Linus Torvalds 已提交
4981
	},
4982
	{ }
L
Linus Torvalds 已提交
4983 4984
};

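/* Per-netns copy of ipv6_route_table_template[]; the table[] indexes
 * below must stay in sync with the ordering of the template entries.
 */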
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4986 4987 4988 4989 4990 4991
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);
4992 4993 4994

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
4995
		table[0].extra1 = net;
4996
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4997 4998 4999 5000 5001 5002 5003
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
5004
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5005 5006 5007 5008

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
5009 5010
	}

5011 5012
	return table;
}
L
Linus Torvalds 已提交
5013 5014
#endif

5015
static int __net_init ip6_route_net_init(struct net *net)
5016
{
5017
	int ret = -ENOMEM;
5018

5019 5020
	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));
5021

5022 5023 5024
	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

D
David Ahern 已提交
5025 5026 5027 5028 5029 5030
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

5031 5032 5033 5034
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
D
David Ahern 已提交
5035
		goto out_fib6_null_entry;
5036
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5037 5038
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);
5039 5040

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5041
	net->ipv6.fib6_has_custom_rules = false;
5042 5043 5044
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
5045 5046
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
5047
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5048 5049
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);
5050 5051 5052 5053

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
5054 5055
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
5056
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
5057 5058
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
5059 5060
#endif

5061 5062 5063 5064 5065 5066 5067 5068 5069
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

5070 5071
	net->ipv6.ip6_rt_gc_expire = 30*HZ;

5072 5073 5074
	ret = 0;
out:
	return ret;
5075

5076 5077 5078 5079 5080 5081
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
D
David Ahern 已提交
5082 5083
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
5084 5085
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5086 5087
out_ip6_dst_ops:
	goto out;
5088 5089
}

5090
static void __net_exit ip6_route_net_exit(struct net *net)
5091
{
D
David Ahern 已提交
5092
	kfree(net->ipv6.fib6_null_entry);
5093 5094 5095 5096 5097
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
5098
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
5099 5100
}

5101 5102 5103
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
5104
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
5105
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
5106 5107 5108 5109 5110 5111 5112
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
5113 5114
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
5115 5116 5117
#endif
}

5118 5119 5120 5121 5122
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

5123 5124 5125 5126 5127 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
5139
	inetpeer_invalidate_tree(bp);
5140 5141 5142
	kfree(bp);
}

5143
static struct pernet_operations ipv6_inetpeer_ops = {
5144 5145 5146 5147
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

5148 5149 5150 5151 5152
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

5153 5154
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
5155
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
5156 5157
};

5158 5159 5160 5161 5162
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
D
David Ahern 已提交
5163 5164
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.fib6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5165 5166 5167 5168 5169 5170 5171 5172 5173 5174
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

5175
int __init ip6_route_init(void)
L
Linus Torvalds 已提交
5176
{
5177
	int ret;
5178
	int cpu;
5179

5180 5181
	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
A
Alexey Dobriyan 已提交
5182
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
5183
				  SLAB_HWCACHE_ALIGN, NULL);
5184
	if (!ip6_dst_ops_template.kmem_cachep)
5185
		goto out;
5186

5187
	ret = dst_entries_init(&ip6_dst_blackhole_ops);
5188
	if (ret)
5189 5190
		goto out_kmem_cache;

5191 5192
	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
5193
		goto out_dst_entries;
5194

5195 5196 5197
	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;
5198

5199 5200
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

5201
	ret = fib6_init();
5202
	if (ret)
5203
		goto out_register_subsys;
5204 5205 5206

	ret = xfrm6_init();
	if (ret)
5207
		goto out_fib6_init;
5208

5209 5210 5211
	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;
5212

5213 5214 5215 5216
	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

5217 5218 5219 5220 5221 5222 5223 5224 5225 5226 5227 5228 5229 5230
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
5231
		goto out_register_late_subsys;
5232

5233
	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
5234
	if (ret)
5235
		goto out_register_late_subsys;
5236

5237 5238 5239 5240 5241 5242 5243
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

5244 5245 5246
out:
	return ret;

5247
out_register_late_subsys:
5248
	rtnl_unregister_all(PF_INET6);
5249
	unregister_pernet_subsys(&ip6_route_net_late_ops);
5250 5251 5252 5253
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
5254 5255
out_fib6_init:
	fib6_gc_cleanup();
5256 5257
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
5258 5259
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5260 5261
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
5262
out_kmem_cache:
5263
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
5264
	goto out;
L
Linus Torvalds 已提交
5265 5266 5267 5268
}

void ip6_route_cleanup(void)
{
5269
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
5270
	unregister_pernet_subsys(&ip6_route_net_late_ops);
T
Thomas Graf 已提交
5271
	fib6_rules_cleanup();
L
Linus Torvalds 已提交
5272 5273
	xfrm6_fini();
	fib6_gc_cleanup();
5274
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
5275
	unregister_pernet_subsys(&ip6_route_net_ops);
5276
	dst_entries_destroy(&ip6_dst_blackhole_ops);
5277
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
L
Linus Torvalds 已提交
5278
}