route.c 130.1 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
6
 *	Pedro Roque		<roque@di.fc.ul.pt>
L
Linus Torvalds 已提交
7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
23 24
 *	Ville Nuorvala
 *		Fixed routing subtrees.
L
Linus Torvalds 已提交
25 26
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
L
Linus Torvalds 已提交
30
#include <linux/errno.h>
31
#include <linux/export.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <linux/jhash.h>
48
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
49 50 51 52 53 54 55 56 57
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
58
#include <net/dst_metadata.h>
L
Linus Torvalds 已提交
59
#include <net/xfrm.h>
60
#include <net/netevent.h>
61
#include <net/netlink.h>
62
#include <net/nexthop.h>
63
#include <net/lwtunnel.h>
64
#include <net/ip_tunnels.h>
D
David Ahern 已提交
65
#include <net/l3mdev.h>
D
David Ahern 已提交
66
#include <trace/events/fib6.h>
L
Linus Torvalds 已提交
67

68
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
69 70 71 72 73

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

74
enum rt6_nud_state {
J
Jiri Benc 已提交
75 76 77
	RT6_NUD_FAIL_HARD = -3,
	RT6_NUD_FAIL_PROBE = -2,
	RT6_NUD_FAIL_DO_RR = -1,
78 79 80
	RT6_NUD_SUCCEED = 1
};

L
Linus Torvalds 已提交
81
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
82
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
83
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
L
Linus Torvalds 已提交
84 85 86 87
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
88
static int		 ip6_dst_gc(struct dst_ops *ops);
L
Linus Torvalds 已提交
89 90

static int		ip6_pkt_discard(struct sk_buff *skb);
E
Eric W. Biederman 已提交
91
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92
static int		ip6_pkt_prohibit(struct sk_buff *skb);
E
Eric W. Biederman 已提交
93
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
L
Linus Torvalds 已提交
94
static void		ip6_link_failure(struct sk_buff *skb);
95 96 97 98
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
99 100
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
101
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
102
			 struct fib6_info *rt, struct dst_entry *dst,
103
			 struct in6_addr *dest, struct in6_addr *src,
104 105
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
106
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
107 108
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);
L
Linus Torvalds 已提交
109

110
#ifdef CONFIG_IPV6_ROUTE_INFO
111
static struct fib6_info *rt6_add_route_info(struct net *net,
112
					   const struct in6_addr *prefix, int prefixlen,
113 114
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
115
					   unsigned int pref);
116
static struct fib6_info *rt6_get_route_info(struct net *net,
117
					   const struct in6_addr *prefix, int prefixlen,
118 119
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
120 121
#endif

122 123 124 125 126 127 128
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

129
void rt6_uncached_list_add(struct rt6_info *rt)
130 131 132 133 134 135 136 137 138 139
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

140
void rt6_uncached_list_del(struct rt6_info *rt)
141 142 143
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
W
Wei Wang 已提交
144
		struct net *net = dev_net(rt->dst.dev);
145 146 147

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
W
Wei Wang 已提交
148
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
149 150 151 152 153 154 155 156 157
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

158 159 160
	if (dev == loopback_dev)
		return;

161 162 163 164 165 166 167 168 169
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

170
			if (rt_idev->dev == dev) {
171 172 173 174
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

175
			if (rt_dev == dev) {
176 177 178 179 180 181 182 183 184
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

185
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
186 187
					     struct sk_buff *skb,
					     const void *daddr)
188
{
D
David S. Miller 已提交
189
	if (!ipv6_addr_any(p))
190
		return (const void *) p;
191 192
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
193 194 195
	return daddr;
}

196 197 198 199
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
200
{
201 202
	struct neighbour *n;

203 204
	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
205 206
	if (n)
		return n;
207 208 209 210 211 212 213 214 215 216
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
217 218
}

219 220 221 222 223
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

224
	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
225 226 227 228 229 230 231 232 233
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

234
static struct dst_ops ip6_dst_ops_template = {
L
Linus Torvalds 已提交
235 236 237 238
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
239
	.default_advmss		=	ip6_default_advmss,
240
	.mtu			=	ip6_mtu,
241
	.cow_metrics		=	dst_cow_metrics_generic,
L
Linus Torvalds 已提交
242 243 244 245 246
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
247
	.redirect		=	rt6_do_redirect,
248
	.local_out		=	__ip6_local_out,
249
	.neigh_lookup		=	ip6_dst_neigh_lookup,
250
	.confirm_neigh		=	ip6_confirm_neigh,
L
Linus Torvalds 已提交
251 252
};

253
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
254
{
255 256 257
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
258 259
}

260 261
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
262 263 264
{
}

265 266
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
267 268 269
{
}

270 271 272 273
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
274
	.mtu			=	ip6_blackhole_mtu,
275
	.default_advmss		=	ip6_default_advmss,
276
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
277
	.redirect		=	ip6_rt_blackhole_redirect,
278
	.cow_metrics		=	dst_cow_metrics_generic,
279
	.neigh_lookup		=	ip6_dst_neigh_lookup,
280 281
};

282
static const u32 ip6_template_metrics[RTAX_MAX] = {
L
Li RongQing 已提交
283
	[RTAX_HOPLIMIT - 1] = 0,
284 285
};

286
static const struct fib6_info fib6_null_entry_template = {
287 288 289 290
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
D
David Ahern 已提交
291 292 293 294
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

295
static const struct rt6_info ip6_null_entry_template = {
296 297 298
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
299
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
300 301 302
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
L
Linus Torvalds 已提交
303 304 305 306
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

T
Thomas Graf 已提交
307 308
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

309
static const struct rt6_info ip6_prohibit_entry_template = {
310 311 312
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
313
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
314 315 316
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
T
Thomas Graf 已提交
317 318 319 320
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

321
static const struct rt6_info ip6_blk_hole_entry_template = {
322 323 324
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
325
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
326 327
		.error		= -EINVAL,
		.input		= dst_discard,
E
Eric W. Biederman 已提交
328
		.output		= dst_discard_out,
T
Thomas Graf 已提交
329 330 331 332 333 334
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

335 336 337 338 339 340 341 342
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

L
Linus Torvalds 已提交
343
/* allocate dst with ip6_dst_ops */
344 345
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
L
Linus Torvalds 已提交
346
{
347
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
W
Wei Wang 已提交
348
					1, DST_OBSOLETE_FORCE_CHK, flags);
349

W
Wei Wang 已提交
350
	if (rt) {
351
		rt6_info_init(rt);
W
Wei Wang 已提交
352 353
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}
354

355
	return rt;
L
Linus Torvalds 已提交
356
}
357
EXPORT_SYMBOL(ip6_dst_alloc);
M
Martin KaFai Lau 已提交
358

L
Linus Torvalds 已提交
359 360 361
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
362
	struct fib6_info *from;
363
	struct inet6_dev *idev;
L
Linus Torvalds 已提交
364

365
	dst_destroy_metrics_generic(dst);
366 367 368
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
369
	if (idev) {
L
Linus Torvalds 已提交
370 371
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
372
	}
373

374 375 376
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
377
	fib6_info_release(from);
378
	rcu_read_unlock();
379 380
}

L
Linus Torvalds 已提交
381 382 383 384 385
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
386
	struct net_device *loopback_dev =
387
		dev_net(dev)->loopback_dev;
L
Linus Torvalds 已提交
388

389 390 391 392 393
	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
394
		}
L
Linus Torvalds 已提交
395 396 397
	}
}

398 399 400 401 402 403 404 405
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

406
static bool rt6_check_expired(const struct rt6_info *rt)
L
Linus Torvalds 已提交
407
{
408 409 410 411
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

412 413
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
414
			return true;
415
	} else if (from) {
416
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
417
			fib6_check_expired(from);
418
	}
419
	return false;
L
Linus Torvalds 已提交
420 421
}

422 423
static struct fib6_info *rt6_multipath_select(const struct net *net,
					      struct fib6_info *match,
424
					     struct flowi6 *fl6, int oif,
D
David Ahern 已提交
425
					     const struct sk_buff *skb,
426
					     int strict)
427
{
428
	struct fib6_info *sibling, *next_sibling;
429

430 431 432 433
	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
434
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
435

436
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
437 438
		return match;

439 440
	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
441 442 443 444
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
445 446 447 448 449 450 451
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

452 453 454
	return match;
}

L
Linus Torvalds 已提交
455
/*
456
 *	Route lookup. rcu_read_lock() should be held.
L
Linus Torvalds 已提交
457 458
 */

459 460
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
461
						    const struct in6_addr *saddr,
L
Linus Torvalds 已提交
462
						    int oif,
463
						    int flags)
L
Linus Torvalds 已提交
464
{
465
	struct fib6_info *sprt;
L
Linus Torvalds 已提交
466

467 468
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
469
		return rt;
470

471
	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
472
		const struct net_device *dev = sprt->fib6_nh.nh_dev;
473

474
		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
475 476
			continue;

477
		if (oif) {
L
Linus Torvalds 已提交
478 479
			if (dev->ifindex == oif)
				return sprt;
480 481 482 483
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
L
Linus Torvalds 已提交
484
		}
485
	}
L
Linus Torvalds 已提交
486

487 488
	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;
489

D
David Ahern 已提交
490
	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
L
Linus Torvalds 已提交
491 492
}

493
#ifdef CONFIG_IPV6_ROUTER_PREF
494 495 496 497 498 499 500 501 502 503 504 505 506
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

static void rt6_probe_deferred(struct work_struct *w)
{
	struct in6_addr mcaddr;
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);

	addrconf_addr_solict_mult(&work->target, &mcaddr);
507
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
508
	dev_put(work->dev);
509
	kfree(work);
510 511
}

512
static void rt6_probe(struct fib6_info *rt)
513
{
514
	struct __rt6_probe_work *work;
515
	const struct in6_addr *nh_gw;
516
	struct neighbour *neigh;
517 518
	struct net_device *dev;

519 520 521 522 523 524 525 526
	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
527
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
528
		return;
529 530 531

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
532
	rcu_read_lock_bh();
533
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
534
	if (neigh) {
D
David Ahern 已提交
535 536
		struct inet6_dev *idev;

537 538 539
		if (neigh->nud_state & NUD_VALID)
			goto out;

D
David Ahern 已提交
540
		idev = __in6_dev_get(dev);
541
		work = NULL;
542
		write_lock(&neigh->lock);
543 544
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
D
David Ahern 已提交
545
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
546 547 548
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
549
		}
550
		write_unlock(&neigh->lock);
551 552
	} else {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
553
	}
554 555 556

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
557 558 559
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
560 561 562
		schedule_work(&work->work);
	}

563
out:
564
	rcu_read_unlock_bh();
565 566
}
#else
567
static inline void rt6_probe(struct fib6_info *rt)
568 569 570 571
{
}
#endif

L
Linus Torvalds 已提交
572
/*
573
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
574
 */
575
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
576
{
577 578
	const struct net_device *dev = rt->fib6_nh.nh_dev;

579
	if (!oif || dev->ifindex == oif)
580
		return 2;
581
	return 0;
582
}
L
Linus Torvalds 已提交
583

584
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
L
Linus Torvalds 已提交
585
{
586
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
587
	struct neighbour *neigh;
588

589 590
	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
591
		return RT6_NUD_SUCCEED;
592 593

	rcu_read_lock_bh();
594 595
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
596 597
	if (neigh) {
		read_lock(&neigh->lock);
598
		if (neigh->nud_state & NUD_VALID)
599
			ret = RT6_NUD_SUCCEED;
600
#ifdef CONFIG_IPV6_ROUTER_PREF
601
		else if (!(neigh->nud_state & NUD_FAILED))
602
			ret = RT6_NUD_SUCCEED;
J
Jiri Benc 已提交
603 604
		else
			ret = RT6_NUD_FAIL_PROBE;
605
#endif
606
		read_unlock(&neigh->lock);
607 608
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
J
Jiri Benc 已提交
609
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
610
	}
611 612
	rcu_read_unlock_bh();

613
	return ret;
L
Linus Torvalds 已提交
614 615
}

616
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
L
Linus Torvalds 已提交
617
{
618
	int m;
619

620
	m = rt6_check_dev(rt, oif);
621
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
622
		return RT6_NUD_FAIL_HARD;
623
#ifdef CONFIG_IPV6_ROUTER_PREF
624
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
625
#endif
626 627 628 629 630
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
631 632 633
	return m;
}

D
David Ahern 已提交
634 635 636 637 638 639 640 641 642 643 644 645 646 647 648
/* called with rc_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

649 650
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
651
				   bool *do_rr)
652
{
653
	int m;
654
	bool match_do_rr = false;
655

656
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
657 658
		goto out;

D
David Ahern 已提交
659
	if (fib6_ignore_linkdown(rt) &&
660
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
661
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
662
		goto out;
663

664
	if (fib6_check_expired(rt))
665 666 667
		goto out;

	m = rt6_score_route(rt, oif, strict);
J
Jiri Benc 已提交
668
	if (m == RT6_NUD_FAIL_DO_RR) {
669 670
		match_do_rr = true;
		m = 0; /* lowest valid score */
J
Jiri Benc 已提交
671
	} else if (m == RT6_NUD_FAIL_HARD) {
672
		goto out;
673 674 675 676
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);
677

J
Jiri Benc 已提交
678
	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
679
	if (m > *mpri) {
680
		*do_rr = match_do_rr;
681 682 683 684 685 686 687
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

688 689 690
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
691 692
				     u32 metric, int oif, int strict,
				     bool *do_rr)
693
{
694
	struct fib6_info *rt, *match, *cont;
695
	int mpri = -1;
L
Linus Torvalds 已提交
696

697
	match = NULL;
698
	cont = NULL;
699
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
700
		if (rt->fib6_metric != metric) {
701 702 703 704 705 706 707
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

708
	for (rt = leaf; rt && rt != rr_head;
709
	     rt = rcu_dereference(rt->fib6_next)) {
710
		if (rt->fib6_metric != metric) {
711 712 713 714
			cont = rt;
			break;
		}

715
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
716 717 718 719 720
	}

	if (match || !cont)
		return match;

721
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
722
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
L
Linus Torvalds 已提交
723

724 725
	return match;
}
L
Linus Torvalds 已提交
726

727
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
W
Wei Wang 已提交
728
				   int oif, int strict)
729
{
730 731
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
732
	bool do_rr = false;
733
	int key_plen;
L
Linus Torvalds 已提交
734

D
David Ahern 已提交
735 736
	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;
W
Wei Wang 已提交
737

738
	rt0 = rcu_dereference(fn->rr_ptr);
739
	if (!rt0)
740
		rt0 = leaf;
L
Linus Torvalds 已提交
741

742 743 744 745 746
	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
747
	key_plen = rt0->fib6_dst.plen;
748
#ifdef CONFIG_IPV6_SUBTREES
749 750
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
751 752
#endif
	if (fn->fn_bit != key_plen)
D
David Ahern 已提交
753
		return net->ipv6.fib6_null_entry;
754

755
	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
756
			     &do_rr);
L
Linus Torvalds 已提交
757

758
	if (do_rr) {
759
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
760

761
		/* no entries matched; do round-robin */
762
		if (!next || next->fib6_metric != rt0->fib6_metric)
W
Wei Wang 已提交
763
			next = leaf;
764

765
		if (next != rt0) {
766
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
767
			/* make sure next is not being deleted from the tree */
768
			if (next->fib6_node)
769
				rcu_assign_pointer(fn->rr_ptr, next);
770
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
771
		}
L
Linus Torvalds 已提交
772 773
	}

D
David Ahern 已提交
774
	return match ? match : net->ipv6.fib6_null_entry;
L
Linus Torvalds 已提交
775 776
}

777
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
778
{
779
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
780 781
}

782 783
#ifdef CONFIG_IPV6_ROUTE_INFO
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
784
		  const struct in6_addr *gwaddr)
785
{
786
	struct net *net = dev_net(dev);
787 788 789
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
790
	unsigned long lifetime;
791
	struct fib6_info *rt;
792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813

	if (len < sizeof(struct route_info)) {
		return -EINVAL;
	}

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
814
		return -EINVAL;
815

816
	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
817 818 819 820 821 822 823 824 825 826 827

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

828
	if (rinfo->prefix_len == 0)
829
		rt = rt6_get_dflt_router(net, gwaddr, dev);
830 831
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
832
					gwaddr, dev);
833 834

	if (rt && !lifetime) {
835
		ip6_del_rt(net, rt);
836 837 838 839
		rt = NULL;
	}

	if (!rt && lifetime)
840 841
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
842
	else if (rt)
843 844
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
845 846

	if (rt) {
847
		if (!addrconf_finite_timeout(lifetime))
848
			fib6_clean_expires(rt);
849
		else
850
			fib6_set_expires(rt, jiffies + HZ * lifetime);
851

852
		fib6_info_release(rt);
853 854 855 856 857
	}
	return 0;
}
#endif

858 859 860 861 862
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
863
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
864
{
865
	struct net_device *dev = rt->fib6_nh.nh_dev;
866

867
	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
868 869 870 871 872
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
873
		    !rt6_need_strict(&rt->fib6_dst.addr))
874 875 876 877 878 879 880 881 882 883 884
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

905
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
906 907 908 909 910 911 912 913 914 915 916 917 918
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

919
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

941
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
942
{
943 944
	rt->dst.flags |= fib6_info_dst_flags(ort);

945
	if (ort->fib6_flags & RTF_REJECT) {
946 947 948 949 950 951 952 953 954
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

	if (ort->fib6_type == RTN_LOCAL) {
		rt->dst.input = ip6_input;
955
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
956 957 958 959 960 961 962 963 964 965 966 967 968
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

969
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
970 971
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
972
	fib6_info_hold(from);
973
	rcu_assign_pointer(rt->from, from);
974 975 976 977 978
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
979 980
}

981
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
982
{
D
David Ahern 已提交
983 984
	struct net_device *dev = fib6_info_nh_dev(ort);

985 986
	ip6_rt_init_dst(rt, ort);

987
	rt->rt6i_dst = ort->fib6_dst;
D
David Ahern 已提交
988
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
989
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
990
	rt->rt6i_flags = ort->fib6_flags;
991 992
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
993
	rt->rt6i_src = ort->fib6_src;
994
#endif
995
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
996
	rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
997 998
}

M
Martin KaFai Lau 已提交
999 1000 1001
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
1002
	struct fib6_node *pn, *sn;
M
Martin KaFai Lau 已提交
1003 1004 1005
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
1006 1007 1008
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
1009
			fn = fib6_node_lookup(sn, NULL, saddr);
M
Martin KaFai Lau 已提交
1010 1011 1012 1013 1014 1015
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
T
Thomas Graf 已提交
1016

1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

1034
/* called with rcu_lock held */
1035
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1036
{
1037
	unsigned short flags = fib6_info_dst_flags(rt);
1038 1039 1040
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

1041
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1042 1043 1044 1045 1046 1047
	if (nrt)
		ip6_rt_copy_init(nrt, rt);

	return nrt;
}

1048 1049
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
D
David Ahern 已提交
1050 1051 1052
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
L
Linus Torvalds 已提交
1053
{
1054
	struct fib6_info *f6i;
L
Linus Torvalds 已提交
1055
	struct fib6_node *fn;
1056
	struct rt6_info *rt;
L
Linus Torvalds 已提交
1057

1058 1059 1060
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

1061
	rcu_read_lock();
1062
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
T
Thomas Graf 已提交
1063
restart:
1064 1065 1066
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
1067
	} else {
1068
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1069
				      fl6->flowi6_oif, flags);
1070
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1071 1072
			f6i = rt6_multipath_select(net, f6i, fl6,
						   fl6->flowi6_oif, skb, flags);
1073
	}
1074
	if (f6i == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
1075 1076 1077 1078
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
1079

1080
	/* Search through exception table */
1081 1082
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
1083 1084
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
1085
	} else if (f6i == net->ipv6.fib6_null_entry) {
1086 1087
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
1088 1089 1090 1091 1092 1093
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
1094
	}
1095

1096
	rcu_read_unlock();
D
David Ahern 已提交
1097

1098
	trace_fib6_table_lookup(net, rt, table, fl6);
D
David Ahern 已提交
1099

T
Thomas Graf 已提交
1100 1101 1102
	return rt;
}

1103
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
D
David Ahern 已提交
1104
				   const struct sk_buff *skb, int flags)
F
Florian Westphal 已提交
1105
{
D
David Ahern 已提交
1106
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
F
Florian Westphal 已提交
1107 1108 1109
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

1110
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
D
David Ahern 已提交
1111 1112
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
T
Thomas Graf 已提交
1113
{
1114 1115 1116
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
T
Thomas Graf 已提交
1117 1118
	};
	struct dst_entry *dst;
1119
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
T
Thomas Graf 已提交
1120

1121
	if (saddr) {
1122
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1123 1124 1125
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

D
David Ahern 已提交
1126
	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
T
Thomas Graf 已提交
1127 1128 1129 1130 1131
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

L
Linus Torvalds 已提交
1132 1133
	return NULL;
}
1134 1135
EXPORT_SYMBOL(rt6_lookup);

T
Thomas Graf 已提交
1136
/* ip6_ins_rt is called with FREE table->tb6_lock.
1137 1138 1139
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
L
Linus Torvalds 已提交
1140 1141
 */

1142
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1143
			struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
1144 1145
{
	int err;
T
Thomas Graf 已提交
1146
	struct fib6_table *table;
L
Linus Torvalds 已提交
1147

1148
	table = rt->fib6_table;
1149
	spin_lock_bh(&table->tb6_lock);
1150
	err = fib6_add(&table->tb6_root, rt, info, extack);
1151
	spin_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1152 1153 1154 1155

	return err;
}

1156
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1157
{
1158
	struct nl_info info = {	.nl_net = net, };
1159

1160
	return __ip6_ins_rt(rt, &info, NULL);
1161 1162
}

1163
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1164 1165
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
L
Linus Torvalds 已提交
1166
{
1167
	struct net_device *dev;
L
Linus Torvalds 已提交
1168 1169 1170 1171 1172 1173
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

1174
	dev = ip6_rt_get_dev_rcu(ort);
1175
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
M
Martin KaFai Lau 已提交
1176 1177 1178 1179 1180 1181 1182 1183
	if (!rt)
		return NULL;

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;
L
Linus Torvalds 已提交
1184

M
Martin KaFai Lau 已提交
1185
	if (!rt6_is_gw_or_nonexthop(ort)) {
1186 1187
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
M
Martin KaFai Lau 已提交
1188
			rt->rt6i_flags |= RTF_ANYCAST;
L
Linus Torvalds 已提交
1189
#ifdef CONFIG_IPV6_SUBTREES
M
Martin KaFai Lau 已提交
1190 1191 1192
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
1193
		}
M
Martin KaFai Lau 已提交
1194
#endif
1195
	}
L
Linus Torvalds 已提交
1196

1197 1198
	return rt;
}
L
Linus Torvalds 已提交
1199

1200
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
M
Martin KaFai Lau 已提交
1201
{
1202
	unsigned short flags = fib6_info_dst_flags(rt);
1203
	struct net_device *dev;
M
Martin KaFai Lau 已提交
1204 1205
	struct rt6_info *pcpu_rt;

1206 1207
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
1208
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1209
	rcu_read_unlock();
M
Martin KaFai Lau 已提交
1210 1211 1212 1213 1214 1215 1216
	if (!pcpu_rt)
		return NULL;
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

1217
/* It should be called with rcu_read_lock() acquired */
1218
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
M
Martin KaFai Lau 已提交
1219
{
1220
	struct rt6_info *pcpu_rt, **p;
M
Martin KaFai Lau 已提交
1221 1222 1223 1224

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

1225 1226
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);
1227

1228 1229 1230
	return pcpu_rt;
}

1231
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1232
					    struct fib6_info *rt)
1233 1234
{
	struct rt6_info *pcpu_rt, *prev, **p;
M
Martin KaFai Lau 已提交
1235 1236 1237

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
1238 1239
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
M
Martin KaFai Lau 已提交
1240 1241
	}

1242 1243 1244
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
1245
	BUG_ON(prev);
1246

M
Martin KaFai Lau 已提交
1247 1248 1249
	return pcpu_rt;
}

1250 1251 1252 1253 1254 1255 1256 1257 1258 1259
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
1260
	struct net *net;
W
Wei Wang 已提交
1261

1262 1263
	if (!bucket || !rt6_ex)
		return;
1264 1265

	net = dev_net(rt6_ex->rt6i->dst.dev);
1266
	hlist_del_rcu(&rt6_ex->hlist);
1267
	dst_release(&rt6_ex->rt6i->dst);
1268 1269 1270
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
W
Wei Wang 已提交
1271
	net->ipv6.rt6_stats->fib_rt_cache--;
1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *rt6_ex, *oldest = NULL;

	if (!bucket)
		return;

	hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
		if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
			oldest = rt6_ex;
	}
	rt6_remove_exception(bucket, oldest);
}

static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 val;

	net_get_random_once(&seed, sizeof(seed));
	val = jhash(dst, sizeof(*dst), seed);

#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		val = jhash(src, sizeof(*src), val);
#endif
	return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

1375
static unsigned int fib6_mtu(const struct fib6_info *rt)
1376 1377 1378
{
	unsigned int mtu;

D
David Ahern 已提交
1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390
	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

1391 1392 1393 1394 1395
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

1396
static int rt6_insert_exception(struct rt6_info *nrt,
1397
				struct fib6_info *ort)
1398
{
1399
	struct net *net = dev_net(nrt->dst.dev);
1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1431
	if (ort->fib6_src.plen)
1432 1433
		src_key = &nrt->rt6i_src.addr;
#endif
1434 1435 1436 1437

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
1438
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1439 1440 1441 1442
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
1443
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1444 1445 1446
		err = -EINVAL;
		goto out;
	}
1447

1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
W
Wei Wang 已提交
1462
	net->ipv6.rt6_stats->fib_rt_cache++;
1463 1464 1465 1466 1467 1468 1469 1470

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
1471
	if (!err) {
1472
		spin_lock_bh(&ort->fib6_table->tb6_lock);
1473
		fib6_update_sernum(net, ort);
1474
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1475 1476
		fib6_force_start_gc(net);
	}
1477 1478 1479 1480

	return err;
}

1481
void rt6_flush_exceptions(struct fib6_info *rt)
1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
1511
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1529
	if (rt->fib6_src.plen)
1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
1541
static int rt6_remove_exception_rt(struct rt6_info *rt)
1542 1543 1544 1545
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
1546
	struct fib6_info *from;
1547 1548
	int err;

1549
	from = rcu_dereference(rt->from);
1550
	if (!from ||
1551
	    !(rt->rt6i_flags & RTF_CACHE))
1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1567
	if (from->fib6_src.plen)
1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
1590
	struct fib6_info *from = rt->from;
1591 1592 1593 1594
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
1595
	    !(rt->rt6i_flags & RTF_CACHE))
1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1608
	if (from->fib6_src.plen)
1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

1620
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	/* If the new MTU is lower than the route PMTU, this new MTU will be the
	 * lowest MTU in the path: always allow updating the route PMTU to
	 * reflect PMTU decreases.
	 *
	 * If the new MTU is higher, and the route PMTU is equal to the local
	 * MTU, this means the old MTU is the lowest in the path, so allow
	 * updating it: if other nodes now have lower MTUs, PMTU discovery will
	 * handle this.
	 */

	if (dst_mtu(&rt->dst) >= mtu)
		return true;

	if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
		return true;

	return false;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1662
				       struct fib6_info *rt, int mtu)
1663 1664 1665 1666 1667 1668 1669 1670
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

1671 1672 1673 1674 1675 1676 1677 1678
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1679
			 * route), the metrics of its rt->from have already
1680 1681
			 * been updated.
			 */
1682
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1683
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1684
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1685
		}
1686
		bucket++;
1687 1688 1689
	}
}

1690 1691
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

1692
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1693 1694 1695 1696 1697 1698 1699 1700 1701 1702 1703 1704 1705 1706 1707 1708 1709 1710 1711 1712 1713 1714 1715 1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726
					struct in6_addr *gateway)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

1727 1728 1729 1730 1731 1732 1733
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

1734 1735 1736 1737 1738 1739
	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
W
Wei Wang 已提交
1740 1741 1742 1743 1744 1745 1746 1747
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
1748 1749
		rt6_remove_exception(bucket, rt6_ex);
		return;
W
Wei Wang 已提交
1750 1751 1752
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
1753 1754 1755
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

1756 1757
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
1758
			neigh_flags = neigh->flags;
1759

1760 1761 1762 1763 1764 1765 1766
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
W
Wei Wang 已提交
1767

1768 1769 1770
	gc_args->more++;
}

1771
void rt6_age_exceptions(struct fib6_info *rt,
1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

1783 1784
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
1798 1799
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
1800 1801
}

1802
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
D
David Ahern 已提交
1803 1804
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
L
Linus Torvalds 已提交
1805
{
1806
	struct fib6_node *fn, *saved_fn;
1807
	struct fib6_info *f6i;
1808
	struct rt6_info *rt;
T
Thomas Graf 已提交
1809
	int strict = 0;
L
Linus Torvalds 已提交
1810

1811
	strict |= flags & RT6_LOOKUP_F_IFACE;
1812
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1813 1814
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;
L
Linus Torvalds 已提交
1815

1816
	rcu_read_lock();
L
Linus Torvalds 已提交
1817

1818
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1819
	saved_fn = fn;
L
Linus Torvalds 已提交
1820

D
David Ahern 已提交
1821 1822 1823
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

M
Martin KaFai Lau 已提交
1824
redo_rt6_select:
1825
	f6i = rt6_select(net, fn, oif, strict);
1826
	if (f6i->fib6_nsiblings)
1827 1828
		f6i = rt6_multipath_select(net, f6i, fl6, oif, skb, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
1829 1830 1831
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
1832 1833 1834 1835 1836 1837
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
	}

	if (f6i == net->ipv6.fib6_null_entry) {
		rt = net->ipv6.ip6_null_entry;
		rcu_read_unlock();
		dst_hold(&rt->dst);
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	}

	/* Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);

		rcu_read_unlock();
		trace_fib6_table_lookup(net, rt, table, fl6);
		return rt;
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);

		rcu_read_unlock();

		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
			rt6_uncached_list_add(uncached_rt);
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
		} else {
			uncached_rt = net->ipv6.ip6_null_entry;
			dst_hold(&uncached_rt->dst);
		}

		trace_fib6_table_lookup(net, uncached_rt, table, fl6);
		return uncached_rt;

	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

		local_bh_disable();
		pcpu_rt = rt6_get_pcpu_route(f6i);

		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

		local_bh_enable();
		rcu_read_unlock();
		trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
		return pcpu_rt;
	}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);

static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
}

struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);

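/* Select the addresses fed into the multipath hash.  For ICMPv6 errors the
 * flow keys are taken from the offending (inner) header so the error takes
 * the same path as the flow it reports on; otherwise the pre-dissected
 * flkeys, or the outer header, are used as-is.
 */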
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
	struct flow_keys *_flkeys = flkeys;
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
	struct icmp6hdr _icmph;

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
	_flkeys = NULL;
out:
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
		keys->tags.flow_label = ip6_flowinfo(key_iph);
		keys->basic.ip_proto = key_iph->nexthdr;
	}
}

/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (ip6_multipath_hash_policy(net)) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}

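/* Route an incoming skb: build a flowi6 from the IPv6 header (plus tunnel
 * key and early-dissected flow keys when available), compute the multipath
 * hash for ICMPv6 errors, and attach the resulting dst to the skb.
 */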
void ip6_route_input(struct sk_buff *skb)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip_tunnel_info *tun_info;
	struct flowi6 fl6 = {
		.flowi6_iif = skb->dev->ifindex,
		.daddr = iph->daddr,
		.saddr = iph->saddr,
		.flowlabel = ip6_flowinfo(iph),
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
	};
	struct flow_keys *flkeys = NULL, _flkeys;

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
	skb_dst_drop(skb);
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}

D
David Ahern 已提交
2060 2061 2062 2063 2064
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
L
Linus Torvalds 已提交
2065
{
D
David Ahern 已提交
2066
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
T
Thomas Graf 已提交
2067 2068
}

2069 2070
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
T
Thomas Graf 已提交
2071
{
2072
	bool any_src;
T
Thomas Graf 已提交
2073

2074 2075 2076 2077 2078 2079 2080
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}
D
David Ahern 已提交
2081

2082
	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2083

2084
	any_src = ipv6_addr_any(&fl6->saddr);
2085
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2086
	    (fl6->flowi6_oif && any_src))
2087
		flags |= RT6_LOOKUP_F_IFACE;
T
Thomas Graf 已提交
2088

2089
	if (!any_src)
2090
		flags |= RT6_LOOKUP_F_HAS_SADDR;
2091 2092
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2093

D
David Ahern 已提交
2094
	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
L
Linus Torvalds 已提交
2095
}
2096
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
L
Linus Torvalds 已提交
2097

2098
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2099
{
2100
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2101
	struct net_device *loopback_dev = net->loopback_dev;
2102 2103
	struct dst_entry *new = NULL;

2104
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2105
		       DST_OBSOLETE_DEAD, 0);
2106
	if (rt) {
2107
		rt6_info_init(rt);
W
Wei Wang 已提交
2108
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2109

2110
		new = &rt->dst;
2111
		new->__use = 1;
2112
		new->input = dst_discard;
E
Eric W. Biederman 已提交
2113
		new->output = dst_discard_out;
2114

2115
		dst_copy_metrics(new, &ort->dst);
2116

2117
		rt->rt6i_idev = in6_dev_get(loopback_dev);
A
Alexey Dobriyan 已提交
2118
		rt->rt6i_gateway = ort->rt6i_gateway;
2119
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2120 2121 2122 2123 2124 2125 2126

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

2127 2128
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
2129 2130
}

L
Linus Torvalds 已提交
2131 2132 2133 2134
/*
 *	Destination cache support functions
 */

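/* A fib6_info can still back a cached dst only if its node cookie matches
 * the cookie stored in the dst and the entry has not expired.
 */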
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2136 2137 2138
{
	u32 rt_cookie = 0;

2139
	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2140 2141 2142 2143 2144 2145 2146 2147
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

2148 2149 2150
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
2151
{
2152
	u32 rt_cookie = 0;
2153

2154
	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
2155
	    rt_cookie != cookie)
2156 2157 2158 2159 2160 2161 2162 2163
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

2164 2165 2166
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
2167
{
2168 2169
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2170
	    fib6_check(from, cookie))
2171 2172 2173 2174 2175
		return &rt->dst;
	else
		return NULL;
}

L
Linus Torvalds 已提交
2176 2177
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
2178
	struct dst_entry *dst_ret;
2179
	struct fib6_info *from;
L
Linus Torvalds 已提交
2180 2181
	struct rt6_info *rt;

2182 2183 2184
	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();
L
Linus Torvalds 已提交
2185

2186 2187 2188 2189
	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */
2190

2191 2192 2193 2194 2195
	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
2196
	else
2197
		dst_ret = rt6_check(rt, from, cookie);
2198 2199 2200 2201

	rcu_read_unlock();

	return dst_ret;
L
Linus Torvalds 已提交
2202 2203 2204 2205 2206 2207 2208
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
2209
		if (rt->rt6i_flags & RTF_CACHE) {
2210
			rcu_read_lock();
2211
			if (rt6_check_expired(rt)) {
2212
				rt6_remove_exception_rt(rt);
2213 2214
				dst = NULL;
			}
2215
			rcu_read_unlock();
2216
		} else {
L
Linus Torvalds 已提交
2217
			dst_release(dst);
2218 2219
			dst = NULL;
		}
L
Linus Torvalds 已提交
2220
	}
2221
	return dst;
L
Linus Torvalds 已提交
2222 2223 2224 2225 2226 2227
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

2228
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
L
Linus Torvalds 已提交
2229

E
Eric Dumazet 已提交
2230
	rt = (struct rt6_info *) skb_dst(skb);
L
Linus Torvalds 已提交
2231
	if (rt) {
2232
		rcu_read_lock();
2233
		if (rt->rt6i_flags & RTF_CACHE) {
W
Wei Wang 已提交
2234
			if (dst_hold_safe(&rt->dst))
2235
				rt6_remove_exception_rt(rt);
2236 2237
		} else {
			struct fib6_info *from;
2238 2239
			struct fib6_node *fn;

2240 2241 2242 2243 2244 2245
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
2246
		}
2247
		rcu_read_unlock();
L
Linus Torvalds 已提交
2248 2249 2250
	}
}

2251 2252
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
2253 2254 2255 2256 2257 2258 2259 2260 2261
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}
2262 2263 2264 2265 2266

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

2267 2268 2269 2270
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

2271
	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2272 2273 2274 2275
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

2276 2277
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
2278 2279 2280 2281 2282 2283
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

2284
	return !(rt->rt6i_flags & RTF_CACHE) &&
2285
		(rt->rt6i_flags & RTF_PCPU || from_set);
2286 2287
}

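/* Apply a PMTU update: confirm the neighbour, clamp the new MTU to at least
 * IPV6_MIN_MTU, then either update an already-cached clone in place or
 * allocate a new RTF_CACHE exception carrying the reduced MTU.
 */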
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
L
Linus Torvalds 已提交
2290
{
2291
	const struct in6_addr *daddr, *saddr;
2292
	struct rt6_info *rt6 = (struct rt6_info *)dst;
L
Linus Torvalds 已提交
2293

2294 2295
	if (rt6->rt6i_flags & RTF_LOCAL)
		return;
2296

2297 2298 2299
	if (dst_metric_locked(dst, RTAX_MTU))
		return;

2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310
	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
2311 2312 2313
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;
2314

2315
	if (!rt6_cache_allowed_for_pmtu(rt6)) {
2316
		rt6_do_update_pmtu(rt6, mtu);
2317 2318 2319
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
2320
	} else if (daddr) {
2321
		struct fib6_info *from;
2322 2323
		struct rt6_info *nrt6;

2324
		rcu_read_lock();
2325 2326
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2327 2328
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
2329
			if (rt6_insert_exception(nrt6, from))
2330
				dst_release_immediate(&nrt6->dst);
2331
		}
2332
		rcu_read_unlock();
L
Linus Torvalds 已提交
2333 2334 2335
	}
}

2336 2337 2338 2339 2340 2341
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

2342
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2343
		     int oif, u32 mark, kuid_t uid)
2344 2345 2346 2347 2348 2349 2350
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
2351
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2352 2353
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
2354
	fl6.flowlabel = ip6_flowinfo(iph);
2355
	fl6.flowi6_uid = uid;
2356 2357 2358

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
2359
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
2360 2361 2362 2363 2364 2365
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
2366 2367
	struct dst_entry *dst;

2368
	ip6_update_pmtu(skb, sock_net(sk), mtu,
2369
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
2370 2371 2372 2373 2374 2375 2376 2377 2378 2379

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
2380 2381 2382
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

2400 2401 2402 2403 2404 2405 2406 2407 2408
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
D
David Ahern 已提交
2409
					     const struct sk_buff *skb,
2410 2411 2412
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2413
	struct rt6_info *ret = NULL, *rt_cache;
2414
	struct fib6_info *rt;
2415 2416 2417
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
	 * check if the redirect has come from the appropriate router.
2419 2420 2421 2422 2423 2424 2425 2426
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

2427
	rcu_read_lock();
2428
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2429
restart:
2430
	for_each_fib6_node_rt_rcu(fn) {
2431
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2432
			continue;
2433
		if (fib6_check_expired(rt))
2434
			continue;
2435
		if (rt->fib6_flags & RTF_REJECT)
2436
			break;
2437
		if (!(rt->fib6_flags & RTF_GATEWAY))
2438
			continue;
2439
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2440
			continue;
2441 2442 2443 2444 2445
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
2446
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2447 2448 2449 2450 2451 2452
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
2453
				ret = rt_cache;
2454 2455
				break;
			}
2456
			continue;
2457
		}
2458 2459 2460 2461
		break;
	}

	if (!rt)
D
David Ahern 已提交
2462
		rt = net->ipv6.fib6_null_entry;
2463
	else if (rt->fib6_flags & RTF_REJECT) {
2464
		ret = net->ipv6.ip6_null_entry;
2465 2466 2467
		goto out;
	}

D
David Ahern 已提交
2468
	if (rt == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
2469 2470 2471
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
2472
	}
M
Martin KaFai Lau 已提交
2473

2474
out:
2475 2476 2477 2478
	if (ret)
		dst_hold(&ret->dst);
	else
		ret = ip6_create_rt_rcu(rt);
2479

2480
	rcu_read_unlock();
2481

2482 2483
	trace_fib6_table_lookup(net, ret, table, fl6);
	return ret;
2484 2485 2486
};

static struct dst_entry *ip6_route_redirect(struct net *net,
D
David Ahern 已提交
2487 2488 2489
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
2490 2491 2492 2493 2494 2495 2496
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

D
David Ahern 已提交
2497
	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2498 2499 2500
				flags, __ip6_route_redirect);
}

2501 2502
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
2503 2504 2505 2506 2507 2508
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
2509
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2510 2511 2512 2513
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
2514
	fl6.flowlabel = ip6_flowinfo(iph);
2515
	fl6.flowi6_uid = uid;
2516

D
David Ahern 已提交
2517
	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2518
	rt6_do_redirect(dst, NULL, skb);
2519 2520 2521 2522
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

2523 2524 2525 2526 2527 2528 2529 2530 2531
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
2532
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
2533 2534 2535 2536
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
2537
	fl6.flowi6_uid = sock_net_uid(net, NULL);
2538

D
David Ahern 已提交
2539
	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2540
	rt6_do_redirect(dst, NULL, skb);
2541 2542 2543
	dst_release(dst);
}

2544 2545
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
2546 2547
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
2548 2549 2550
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

2551
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
L
Linus Torvalds 已提交
2552
{
2553 2554 2555 2556
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

L
Linus Torvalds 已提交
2557 2558
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

2559 2560
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
L
Linus Torvalds 已提交
2561 2562

	/*
2563 2564 2565
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
L
Linus Torvalds 已提交
2566 2567 2568 2569 2570 2571 2572
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

2573
static unsigned int ip6_mtu(const struct dst_entry *dst)
2574 2575
{
	struct inet6_dev *idev;
2576
	unsigned int mtu;
2577 2578

	mtu = dst_metric_raw(dst, RTAX_MTU);
2579
	if (mtu)
E
Eric Dumazet 已提交
2580
		goto out;
2581 2582

	mtu = IPV6_MIN_MTU;
2583 2584 2585 2586 2587 2588 2589

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

E
Eric Dumazet 已提交
2590
out:
2591 2592 2593
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2594 2595
}

2596
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2597
				  struct flowi6 *fl6)
L
Linus Torvalds 已提交
2598
{
2599
	struct dst_entry *dst;
L
Linus Torvalds 已提交
2600 2601
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
2602
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
2603

2604
	if (unlikely(!idev))
E
Eric Dumazet 已提交
2605
		return ERR_PTR(-ENODEV);
L
Linus Torvalds 已提交
2606

2607
	rt = ip6_dst_alloc(net, dev, 0);
2608
	if (unlikely(!rt)) {
L
Linus Torvalds 已提交
2609
		in6_dev_put(idev);
2610
		dst = ERR_PTR(-ENOMEM);
L
Linus Torvalds 已提交
2611 2612 2613
		goto out;
	}

2614
	rt->dst.flags |= DST_HOST;
2615
	rt->dst.input = ip6_input;
2616
	rt->dst.output  = ip6_output;
2617
	rt->rt6i_gateway  = fl6->daddr;
2618
	rt->rt6i_dst.addr = fl6->daddr;
2619 2620
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
L
Li RongQing 已提交
2621
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
L
Linus Torvalds 已提交
2622

2623
	/* Add this dst into uncached_list so that rt6_disable_ip() can
2624 2625 2626
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
W
Wei Wang 已提交
2627
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
L
Linus Torvalds 已提交
2628

2629 2630
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

L
Linus Torvalds 已提交
2631
out:
2632
	return dst;
L
Linus Torvalds 已提交
2633 2634
}

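/* Rate-limited dst garbage collection: do nothing if the previous run was
 * recent and we are still under ip6_rt_max_size, otherwise kick
 * fib6_run_gc() with an expiry that grows on each pass and decays by the
 * elasticity factor.  Returns nonzero while we remain over the limit.
 */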
static int ip6_dst_gc(struct dst_ops *ops)
L
Linus Torvalds 已提交
2636
{
2637
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2638 2639 2640 2641 2642
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2643
	int entries;
2644

2645
	entries = dst_entries_get_fast(ops);
2646
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2647
	    entries <= rt_max_size)
L
Linus Torvalds 已提交
2648 2649
		goto out;

2650
	net->ipv6.ip6_rt_gc_expire++;
2651
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2652 2653
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
2654
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
L
Linus Torvalds 已提交
2655
out:
2656
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2657
	return entries > rt_max_size;
L
Linus Torvalds 已提交
2658 2659
}

2660
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2661
			       struct fib6_config *cfg)
2662
{
2663
	struct dst_metrics *p;
2664

2665 2666
	if (!cfg->fc_mx)
		return 0;
2667

2668 2669 2670
	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
		return -ENOMEM;
2671

2672 2673
	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;
2674

2675
	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2676
}
L
Linus Torvalds 已提交
2677

2678 2679
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
2680 2681
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
2682 2683 2684 2685 2686 2687 2688 2689 2690
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

2691
	table = fib6_get_table(net, tbid);
2692 2693 2694 2695 2696 2697
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

2698
	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
D
David Ahern 已提交
2699
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2700 2701 2702 2703 2704 2705 2706 2707 2708 2709

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

2710 2711
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
2712
				     const struct net_device *dev,
2713 2714
				     struct netlink_ext_ack *extack)
{
2715
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2716 2717 2718 2719 2720 2721 2722 2723
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
2724 2725
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2726 2727
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
2728 2729 2730 2731 2732 2733 2734 2735 2736
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

2737 2738 2739 2740 2741 2742 2743 2744 2745 2746 2747
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
2748 2749 2750 2751
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
2752 2753 2754 2755 2756 2757 2758 2759 2760 2761
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
D
David Ahern 已提交
2762
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2763 2764 2765 2766 2767 2768 2769 2770 2771 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 2787

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

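/* Validate the gateway of a new route: reject local addresses, require a
 * link-local (or RFC4798-style IPv4-mapped) nexthop, resolve the egress
 * device via ip6_route_check_nh{,_onlink}() when it was not given, and
 * re-check the gateway once the device is known.
 */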
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
2794
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2795
	const struct net_device *dev = *_dev;
2796
	bool need_addr_check = !dev;
2797 2798 2799 2800 2801 2802 2803
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
2804 2805 2806
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2807 2808 2809 2810 2811 2812 2813 2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824 2825 2826 2827 2828 2829 2830 2831 2832 2833 2834 2835 2836 2837 2838 2839 2840 2841 2842 2843 2844 2845
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly prohibits using non-link-local
		 * addresses as nexthop addresses.
		 * Otherwise, the router will not be able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}
2846 2847 2848 2849 2850 2851 2852 2853 2854 2855

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

2856 2857 2858 2859 2860
	err = 0;
out:
	return err;
}

2861
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2862
					      gfp_t gfp_flags,
2863
					      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
2864
{
2865
	struct net *net = cfg->fc_nlinfo.nl_net;
2866
	struct fib6_info *rt = NULL;
L
Linus Torvalds 已提交
2867 2868
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
T
Thomas Graf 已提交
2869
	struct fib6_table *table;
L
Linus Torvalds 已提交
2870
	int addr_type;
2871
	int err = -EINVAL;
L
Linus Torvalds 已提交
2872

2873
	/* RTF_PCPU is an internal flag; can not be set by userspace */
2874 2875
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2876
		goto out;
2877
	}
2878

2879 2880 2881 2882 2883 2884
	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

2885 2886 2887 2888 2889
	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

2890 2891 2892 2893 2894 2895
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
2896
		goto out;
2897
	}
L
Linus Torvalds 已提交
2898
#ifndef CONFIG_IPV6_SUBTREES
2899 2900 2901
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2902
		goto out;
2903
	}
L
Linus Torvalds 已提交
2904
#endif
2905
	if (cfg->fc_ifindex) {
L
Linus Torvalds 已提交
2906
		err = -ENODEV;
2907
		dev = dev_get_by_index(net, cfg->fc_ifindex);
L
Linus Torvalds 已提交
2908 2909 2910 2911 2912 2913 2914
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

2915 2916
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;
L
Linus Torvalds 已提交
2917

2918 2919 2920 2921 2922 2923 2924 2925 2926 2927 2928 2929 2930 2931 2932
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

2933
	err = -ENOBUFS;
2934 2935
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
2936
		table = fib6_get_table(net, cfg->fc_table);
2937
		if (!table) {
2938
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
2939 2940 2941 2942 2943
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}
2944 2945

	if (!table)
T
Thomas Graf 已提交
2946 2947
		goto out;

2948 2949 2950
	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
L
Linus Torvalds 已提交
2951
		goto out;
2952 2953 2954

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;
L
Linus Torvalds 已提交
2955

2956 2957 2958 2959
	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
		goto out;

2960
	if (cfg->fc_flags & RTF_EXPIRES)
2961
		fib6_set_expires(rt, jiffies +
2962 2963
				clock_t_to_jiffies(cfg->fc_expires));
	else
2964
		fib6_clean_expires(rt);
L
Linus Torvalds 已提交
2965

2966 2967
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
2968
	rt->fib6_protocol = cfg->fc_protocol;
2969 2970

	addr_type = ipv6_addr_type(&cfg->fc_dst);
L
Linus Torvalds 已提交
2971

2972 2973 2974
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

2975
		err = lwtunnel_build_state(cfg->fc_encap_type,
2976
					   cfg->fc_encap, AF_INET6, cfg,
2977
					   &lwtstate, extack);
2978 2979
		if (err)
			goto out;
2980
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
2981 2982
	}

2983 2984 2985
	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
2986
		rt->dst_host = true;
2987

L
Linus Torvalds 已提交
2988
#ifdef CONFIG_IPV6_SUBTREES
2989 2990
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
L
Linus Torvalds 已提交
2991 2992
#endif

2993
	rt->fib6_metric = cfg->fc_metric;
2994
	rt->fib6_nh.nh_weight = 1;
L
Linus Torvalds 已提交
2995

2996 2997
	rt->fib6_type = cfg->fc_type;

L
Linus Torvalds 已提交
2998 2999 3000
	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
3001
	if ((cfg->fc_flags & RTF_REJECT) ||
3002 3003 3004
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
L
Linus Torvalds 已提交
3005
		/* hold loopback dev/idev if we haven't done so. */
3006
		if (dev != net->loopback_dev) {
L
Linus Torvalds 已提交
3007 3008 3009 3010
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
3011
			dev = net->loopback_dev;
L
Linus Torvalds 已提交
3012 3013 3014 3015 3016 3017 3018
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
3019
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
L
Linus Torvalds 已提交
3020 3021 3022
		goto install_route;
	}

3023
	if (cfg->fc_flags & RTF_GATEWAY) {
3024 3025
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
3026
			goto out;
L
Linus Torvalds 已提交
3027

3028
		rt->fib6_nh.nh_gw = cfg->fc_gateway;
L
Linus Torvalds 已提交
3029 3030 3031
	}

	err = -ENODEV;
3032
	if (!dev)
L
Linus Torvalds 已提交
3033 3034
		goto out;

3035 3036 3037 3038 3039 3040
	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

3041 3042 3043 3044 3045 3046
	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

3047 3048
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3049
			NL_SET_ERR_MSG(extack, "Invalid source address");
3050 3051 3052
			err = -EINVAL;
			goto out;
		}
3053 3054
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
3055
	} else
3056
		rt->fib6_prefsrc.plen = 0;
3057

3058
	rt->fib6_flags = cfg->fc_flags;
L
Linus Torvalds 已提交
3059 3060

install_route:
3061
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3062
	    !netif_carrier_ok(dev))
3063 3064
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3065
	rt->fib6_nh.nh_dev = dev;
3066
	rt->fib6_table = table;
3067

3068
	cfg->fc_nlinfo.nl_net = dev_net(dev);
3069

D
David Ahern 已提交
3070 3071 3072
	if (idev)
		in6_dev_put(idev);

3073
	return rt;
3074 3075 3076 3077 3078 3079
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

3080
	fib6_info_release(rt);
3081
	return ERR_PTR(err);
3082 3083
}

3084 3085
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
3086
{
3087
	struct fib6_info *rt;
3088 3089
	int err;

3090
	rt = ip6_route_info_create(cfg, gfp_flags, extack);
3091 3092
	if (IS_ERR(rt))
		return PTR_ERR(rt);
3093

3094
	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3095
	fib6_info_release(rt);
3096

L
Linus Torvalds 已提交
3097 3098 3099
	return err;
}

3100
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
3101
{
3102
	struct net *net = info->nl_net;
T
Thomas Graf 已提交
3103
	struct fib6_table *table;
3104
	int err;
L
Linus Torvalds 已提交
3105

D
David Ahern 已提交
3106
	if (rt == net->ipv6.fib6_null_entry) {
3107 3108 3109
		err = -ENOENT;
		goto out;
	}
3110

3111
	table = rt->fib6_table;
3112
	spin_lock_bh(&table->tb6_lock);
3113
	err = fib6_del(rt, info);
3114
	spin_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
3115

3116
out:
3117
	fib6_info_release(rt);
L
Linus Torvalds 已提交
3118 3119 3120
	return err;
}

3121
int ip6_del_rt(struct net *net, struct fib6_info *rt)
3122
{
3123 3124
	struct nl_info info = { .nl_net = net };

3125
	return __ip6_del_rt(rt, &info);
3126 3127
}

3128
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3129 3130
{
	struct nl_info *info = &cfg->fc_nlinfo;
3131
	struct net *net = info->nl_net;
3132
	struct sk_buff *skb = NULL;
3133
	struct fib6_table *table;
3134
	int err = -ENOENT;
3135

D
David Ahern 已提交
3136
	if (rt == net->ipv6.fib6_null_entry)
3137
		goto out_put;
3138
	table = rt->fib6_table;
3139
	spin_lock_bh(&table->tb6_lock);
3140

3141
	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3142
		struct fib6_info *sibling, *next_sibling;
3143

3144 3145 3146 3147 3148
		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

3149
			if (rt6_fill_node(net, skb, rt, NULL,
3150 3151 3152 3153 3154 3155 3156 3157
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

3158
		list_for_each_entry_safe(sibling, next_sibling,
3159 3160
					 &rt->fib6_siblings,
					 fib6_siblings) {
3161 3162
			err = fib6_del(sibling, info);
			if (err)
3163
				goto out_unlock;
3164 3165 3166 3167
		}
	}

	err = fib6_del(rt, info);
3168
out_unlock:
3169
	spin_unlock_bh(&table->tb6_lock);
3170
out_put:
3171
	fib6_info_release(rt);
3172 3173

	if (skb) {
3174
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3175 3176
			    info->nlh, gfp_any());
	}
3177 3178 3179
	return err;
}

3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

3196 3197
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
3198
{
3199
	struct rt6_info *rt_cache;
T
Thomas Graf 已提交
3200
	struct fib6_table *table;
3201
	struct fib6_info *rt;
L
Linus Torvalds 已提交
3202 3203 3204
	struct fib6_node *fn;
	int err = -ESRCH;

3205
	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3206 3207
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
T
Thomas Graf 已提交
3208
		return err;
3209
	}
T
Thomas Graf 已提交
3210

3211
	rcu_read_lock();
L
Linus Torvalds 已提交
3212

T
Thomas Graf 已提交
3213
	fn = fib6_locate(&table->tb6_root,
3214
			 &cfg->fc_dst, cfg->fc_dst_len,
3215
			 &cfg->fc_src, cfg->fc_src_len,
3216
			 !(cfg->fc_flags & RTF_CACHE));
3217

L
Linus Torvalds 已提交
3218
	if (fn) {
3219
		for_each_fib6_node_rt_rcu(fn) {
3220
			if (cfg->fc_flags & RTF_CACHE) {
3221 3222
				int rc;

3223 3224
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
3225 3226 3227 3228 3229 3230
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
					if (rc != -ESRCH)
						return rc;
				}
				continue;
3231
			}
3232
			if (cfg->fc_ifindex &&
3233 3234
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
L
Linus Torvalds 已提交
3235
				continue;
3236
			if (cfg->fc_flags & RTF_GATEWAY &&
3237
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
L
Linus Torvalds 已提交
3238
				continue;
3239
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
L
Linus Torvalds 已提交
3240
				continue;
3241
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3242
				continue;
3243
			fib6_info_hold(rt);
3244
			rcu_read_unlock();
L
Linus Torvalds 已提交
3245

3246 3247 3248 3249 3250
			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
L
Linus Torvalds 已提交
3251 3252
		}
	}
3253
	rcu_read_unlock();
L
Linus Torvalds 已提交
3254 3255 3256 3257

	return err;
}

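/* Process an ICMPv6 redirect: validate the ND options and target address,
 * update the neighbour cache entry for the new first hop, install an
 * RTF_CACHE exception pointing at it, and raise NETEVENT_REDIRECT.
 */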
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3259 3260
{
	struct netevent_redirect netevent;
3261 3262 3263 3264
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
3265
	struct fib6_info *from;
3266
	struct rd_msg *msg;
3267 3268
	int optlen, on_link;
	u8 *lladdr;
3269

3270
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3271
	optlen -= sizeof(*msg);
3272 3273

	if (optlen < 0) {
3274
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3275 3276 3277
		return;
	}

3278
	msg = (struct rd_msg *)icmp6_hdr(skb);
3279

3280
	if (ipv6_addr_is_multicast(&msg->dest)) {
3281
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3282 3283 3284
		return;
	}

3285
	on_link = 0;
3286
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3287
		on_link = 1;
3288
	} else if (ipv6_addr_type(&msg->target) !=
3289
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3290
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

3305
	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3306 3307 3308
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}
3309 3310

	lladdr = NULL;
3311 3312 3313 3314 3315 3316 3317 3318 3319
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

3320
	rt = (struct rt6_info *) dst;
3321
	if (rt->rt6i_flags & RTF_REJECT) {
3322
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3323
		return;
3324
	}
3325

3326 3327 3328 3329
	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
3330
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3331

3332
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3333 3334
	if (!neigh)
		return;
3335

L
Linus Torvalds 已提交
3336 3337 3338 3339
	/*
	 *	We have finally decided to accept it.
	 */

3340
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
L
Linus Torvalds 已提交
3341 3342 3343
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3344 3345
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);
L
Linus Torvalds 已提交
3346

3347
	rcu_read_lock();
3348
	from = rcu_dereference(rt->from);
3349
	fib6_info_hold(from);
3350
	rcu_read_unlock();
3351 3352

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3353
	if (!nrt)
L
Linus Torvalds 已提交
3354 3355 3356 3357 3358 3359
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

A
Alexey Dobriyan 已提交
3360
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
L
Linus Torvalds 已提交
3361

3362 3363 3364 3365
	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * take care of it
	 */
3366
	if (rt6_insert_exception(nrt, from)) {
3367 3368 3369
		dst_release_immediate(&nrt->dst);
		goto out;
	}
L
Linus Torvalds 已提交
3370

3371 3372
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
3373
	netevent.daddr = &msg->dest;
3374
	netevent.neigh = neigh;
3375 3376
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

L
Linus Torvalds 已提交
3377
out:
3378
	fib6_info_release(from);
3379
	neigh_release(neigh);
3380 3381
}

3382
#ifdef CONFIG_IPV6_ROUTE_INFO
3383
static struct fib6_info *rt6_get_route_info(struct net *net,
3384
					   const struct in6_addr *prefix, int prefixlen,
3385 3386
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
3387
{
3388 3389
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
3390
	struct fib6_node *fn;
3391
	struct fib6_info *rt = NULL;
T
Thomas Graf 已提交
3392 3393
	struct fib6_table *table;

3394
	table = fib6_get_table(net, tb_id);
3395
	if (!table)
T
Thomas Graf 已提交
3396
		return NULL;
3397

3398
	rcu_read_lock();
3399
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3400 3401 3402
	if (!fn)
		goto out;

3403
	for_each_fib6_node_rt_rcu(fn) {
3404
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3405
			continue;
3406
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3407
			continue;
3408
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3409
			continue;
3410
		fib6_info_hold(rt);
3411 3412 3413
		break;
	}
out:
3414
	rcu_read_unlock();
3415 3416 3417
	return rt;
}

3418
static struct fib6_info *rt6_add_route_info(struct net *net,
3419
					   const struct in6_addr *prefix, int prefixlen,
3420 3421
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
3422
					   unsigned int pref)
3423
{
3424
	struct fib6_config cfg = {
3425
		.fc_metric	= IP6_RT_PRIO_USER,
3426
		.fc_ifindex	= dev->ifindex,
3427 3428 3429
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
3430
		.fc_protocol = RTPROT_RA,
3431
		.fc_type = RTN_UNICAST,
3432
		.fc_nlinfo.portid = 0,
3433 3434
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
3435 3436
	};

3437
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
A
Alexey Dobriyan 已提交
3438 3439
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
3440

3441 3442
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
3443
		cfg.fc_flags |= RTF_DEFAULT;
3444

3445
	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3446

3447
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3448 3449 3450
}
#endif

3451
struct fib6_info *rt6_get_dflt_router(struct net *net,
3452 3453
				     const struct in6_addr *addr,
				     struct net_device *dev)
3454
{
3455
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3456
	struct fib6_info *rt;
T
Thomas Graf 已提交
3457
	struct fib6_table *table;
L
Linus Torvalds 已提交
3458

3459
	table = fib6_get_table(net, tb_id);
3460
	if (!table)
T
Thomas Graf 已提交
3461
		return NULL;
L
Linus Torvalds 已提交
3462

3463 3464
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3465
		if (dev == rt->fib6_nh.nh_dev &&
3466
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3467
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
L
Linus Torvalds 已提交
3468 3469 3470
			break;
	}
	if (rt)
3471
		fib6_info_hold(rt);
3472
	rcu_read_unlock();
L
Linus Torvalds 已提交
3473 3474 3475
	return rt;
}

3476
struct fib6_info *rt6_add_dflt_router(struct net *net,
3477
				     const struct in6_addr *gwaddr,
3478 3479
				     struct net_device *dev,
				     unsigned int pref)
L
Linus Torvalds 已提交
3480
{
3481
	struct fib6_config cfg = {
D
David Ahern 已提交
3482
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3483
		.fc_metric	= IP6_RT_PRIO_USER,
3484 3485 3486
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3487
		.fc_protocol = RTPROT_RA,
3488
		.fc_type = RTN_UNICAST,
3489
		.fc_nlinfo.portid = 0,
3490
		.fc_nlinfo.nlh = NULL,
3491
		.fc_nlinfo.nl_net = net,
3492
	};
L
Linus Torvalds 已提交
3493

A
Alexey Dobriyan 已提交
3494
	cfg.fc_gateway = *gwaddr;
L
Linus Torvalds 已提交
3495

3496
	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3497 3498 3499 3500 3501 3502
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}
L
Linus Torvalds 已提交
3503

3504
	return rt6_get_dflt_router(net, gwaddr, dev);
L
Linus Torvalds 已提交
3505 3506
}

3507 3508
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
L
Linus Torvalds 已提交
3509
{
3510
	struct fib6_info *rt;
L
Linus Torvalds 已提交
3511 3512

restart:
3513 3514
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
D
David Ahern 已提交
3515 3516 3517
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

3518
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
D
David Ahern 已提交
3519
		    (!idev || idev->cnf.accept_ra != 2)) {
3520 3521 3522
			fib6_info_hold(rt);
			rcu_read_unlock();
			ip6_del_rt(net, rt);
L
Linus Torvalds 已提交
3523 3524 3525
			goto restart;
		}
	}
3526
	rcu_read_unlock();
3527 3528 3529 3530 3531 3532 3533 3534 3535 3536 3537 3538 3539 3540 3541 3542

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3543
				__rt6_purge_dflt_routers(net, table);
3544 3545 3546 3547
		}
	}

	rcu_read_unlock();
L
Linus Torvalds 已提交
3548 3549
}

3550 3551
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
3552 3553 3554 3555
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

D
David Ahern 已提交
3556 3557
	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
3558 3559 3560 3561 3562 3563
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
3564
	cfg->fc_type = rtmsg->rtmsg_type;
3565

3566
	cfg->fc_nlinfo.nl_net = net;
3567

A
Alexey Dobriyan 已提交
3568 3569 3570
	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3571 3572
}

3573
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
L
Linus Torvalds 已提交
3574
{
3575
	struct fib6_config cfg;
L
Linus Torvalds 已提交
3576 3577 3578
	struct in6_rtmsg rtmsg;
	int err;

3579
	switch (cmd) {
L
Linus Torvalds 已提交
3580 3581
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
3582
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
L
Linus Torvalds 已提交
3583 3584 3585 3586 3587
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;
3588

3589
		rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3590

L
Linus Torvalds 已提交
3591 3592 3593
		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
3594
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
L
Linus Torvalds 已提交
3595 3596
			break;
		case SIOCDELRT:
3597
			err = ip6_route_del(&cfg, NULL);
L
Linus Torvalds 已提交
3598 3599 3600 3601 3602 3603 3604
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
3605
	}
L
Linus Torvalds 已提交
3606 3607 3608 3609 3610 3611 3612 3613

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

3614
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
L
Linus Torvalds 已提交
3615
{
3616
	int type;
E
Eric Dumazet 已提交
3617
	struct dst_entry *dst = skb_dst(skb);
3618 3619
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
3620
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
U
Ulrich Weber 已提交
3621
		if (type == IPV6_ADDR_ANY) {
3622 3623
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
3624
				      IPSTATS_MIB_INADDRERRORS);
3625 3626 3627 3628
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
3629 3630
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
3631 3632
		break;
	}
3633
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
L
Linus Torvalds 已提交
3634 3635 3636 3637
	kfree_skb(skb);
	return 0;
}

3638 3639
static int ip6_pkt_discard(struct sk_buff *skb)
{
3640
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3641 3642
}

E
Eric W. Biederman 已提交
3643
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
3644
{
E
Eric Dumazet 已提交
3645
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
}

static int ip6_pkt_prohibit(struct sk_buff *skb)
{
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
}

static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	skb->dev = skb_dst(skb)->dev;
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
}

/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}

/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;
	struct net *net;
	struct in6_addr *addr;
};

static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
	    rt != net->ipv6.fib6_null_entry &&
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
		spin_lock_bh(&rt6_exception_lock);
		/* remove prefsrc entry */
		rt->fib6_prefsrc.plen = 0;
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
}

#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when a gateway turns into a host. */
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
		return -1;
	}

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

	return 0;
}

void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

struct arg_netdev_event {
	const struct net_device *dev;
	union {
		unsigned int nh_flags;
		unsigned long event;
	};
};

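/* Find the first route in the same fib6 node that belongs to the same ECMP
 * group as @rt (same metric and qualified for multipath).
 */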
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	struct fib6_node *fn;

	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	iter = rcu_dereference_protected(fn->leaf,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
	while (iter) {
		if (iter->fib6_metric == rt->fib6_metric &&
		    rt6_qualify_for_ecmp(iter))
			return iter;
		iter = rcu_dereference_protected(iter->fib6_next,
				lockdep_is_held(&rt->fib6_table->tb6_lock));
	}

	return NULL;
}

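/* A nexthop is considered dead if it is marked RTNH_F_DEAD, or if it is
 * marked RTNH_F_LINKDOWN and the route ignores nexthops with link down.
 */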
static bool rt6_is_dead(const struct fib6_info *rt)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
	     fib6_ignore_linkdown(rt)))
		return true;

	return false;
}

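/* Sum the weights of all nexthops of the multipath route that are not dead. */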
static int rt6_multipath_total_weight(const struct fib6_info *rt)
{
	struct fib6_info *iter;
	int total = 0;

	if (!rt6_is_dead(rt))
		total += rt->fib6_nh.nh_weight;

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
		if (!rt6_is_dead(iter))
			total += iter->fib6_nh.nh_weight;
	}

	return total;
}

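/* Set the hash-threshold upper bound for one nexthop: the running weight
 * scaled into the 31-bit hash space, or -1 if the nexthop is dead.
 */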
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
		*weight += rt->fib6_nh.nh_weight;
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
}

static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
{
	struct fib6_info *iter;
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		rt6_upper_bound_set(iter, &weight, total);
}

void rt6_multipath_rebalance(struct fib6_info *rt)
{
	struct fib6_info *first;
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
	if (!rt->fib6_nsiblings || rt->should_flush)
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

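/* fib6_clean_all() callback: clear the given nexthop flags on routes whose
 * nexthop uses the device that came back up, then rebalance the siblings.
 */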
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	struct net *net = dev_net(arg->dev);

	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
		fib6_update_sernum_upto_root(net, rt);
		rt6_multipath_rebalance(rt);
	}

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.nh_flags = nh_flags,
		},
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

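/* True if @rt or any of its siblings uses @dev as its nexthop device. */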
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
				   const struct net_device *dev)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		return true;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			return true;

	return false;
}

static void rt6_multipath_flush(struct fib6_info *rt)
{
	struct fib6_info *iter;

	rt->should_flush = 1;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		iter->should_flush = 1;
}

static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
					     const struct net_device *down_dev)
{
	struct fib6_info *iter;
	unsigned int dead = 0;

	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		dead++;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
			dead++;

	return dead;
}

static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
	struct fib6_info *iter;

	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
}

/* called with write lock held for table with rt */
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
{
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
	struct net *net = dev_net(dev);

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
	case NETDEV_DOWN:
		if (rt->should_flush)
			return -1;
		if (!rt->fib6_nsiblings)
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
			if (rt->fib6_nsiblings + 1 == count) {
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
			fib6_update_sernum(net, rt);
			rt6_multipath_rebalance(rt);
		}
		return -2;
	case NETDEV_CHANGE:
		if (rt->fib6_nh.nh_dev != dev ||
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
			break;
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
		rt6_multipath_rebalance(rt);
		break;
	}

	return 0;
}

void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
{
	struct arg_netdev_event arg = {
		.dev = dev,
		{
			.event = event,
		},
	};

	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

struct rt6_mtu_change_arg {
	struct net_device *dev;
	unsigned int mtu;
};

static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6, PMTU discovery is not optional,
	   so an RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For an administrative MTU increase, there is no way to discover
	   an IPv6 PMTU increase, so the PMTU must be updated here.
	   Since RFC 1981 doesn't cover administrative MTU increases,
	   updating the PMTU on an increase is a MUST (e.g. jumbo frames).
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
};

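/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into a fib6_config. */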
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

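/* One parsed nexthop of an RTA_MULTIPATH request, queued for insertion. */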
struct rt6_nh {
	struct fib6_info *fib6_info;
	struct fib6_config r_cfg;
	struct list_head next;
};

static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
		        &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
		        nh->r_cfg.fc_ifindex);
	}
}

static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	err = ip6_convert_metrics(net, rt, r_cfg);
	if (err) {
		kfree(nh);
		return err;
	}
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

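/* Insert every nexthop of an RTA_MULTIPATH request as a sibling route and
 * send a single notification covering the whole multipath route.
 */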
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}

		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		rt_last = nh->fib6_info;
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		/* save reference to first route for notification */
		if (!rt_notif && !err)
			rt_notif = nh->fib6_info;

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route, we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by the first new one, and the
		 * rest should be appended to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
		err = ip6_route_del(&r_cfg, extack);
		if (err)
			last_err = err;

		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

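/* Worst-case netlink message size for dumping @rt, including one
 * RTA_MULTIPATH nexthop per sibling route.
 */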
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

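/* Fill the nexthop attributes (flags, gateway, oif, encap) for a route dump;
 * with @skip_oif the caller encodes the interface in a rtnexthop instead.
 */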
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

/* add multipath next hop */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

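/* Build one routing message for @rt.  @dst, @dest and @src are only supplied
 * when answering an RTM_GETROUTE query; dumps and notifications pass NULL.
 */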
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rtmsg *rtm;
	struct nlmsghdr *nlh;
	long expires = 0;
	u32 *pmetrics;
	u32 table;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt->fib6_dst.plen;
	rtm->rtm_src_len = rt->fib6_src.plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt->fib6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt->fib6_dst.addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt->fib6_src.addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		if (ipv6_addr_is_multicast(&rt->fib6_dst.addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt->fib6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->fib6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (iif) {
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	skb_dst_set(skb, &rt->dst);

	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

L
4850 4851
		kfree_skb(skb);
		goto errout;
L

4854
	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
4855
errout:
L
}

void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

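/* Wire the loopback device into the per-netns null/prohibit/blackhole entries
 * on NETDEV_REGISTER and drop those references again on NETDEV_UNREGISTER.
 */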
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS

static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}

static int rt6_stats_seq_open(struct inode *inode, struct file *file)
{
	return single_open_net(inode, file, rt6_stats_seq_show);
}

static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};

struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif

static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
	proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}