route.c 132.9 KB
Newer Older
L
Linus Torvalds 已提交
1 2 3 4 5
/*
 *	Linux INET6 implementation
 *	FIB front-end.
 *
 *	Authors:
 *	Pedro Roque		<roque@di.fc.ul.pt>
 *
 *	This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*	Changes:
 *
 *	YOSHIFUJI Hideaki @USAGI
 *		reworked default router selection.
 *		- respect outgoing interface
 *		- select from (probably) reachable routers (i.e.
 *		routers in REACHABLE, STALE, DELAY or PROBE states).
 *		- always select the same router if it is (probably)
 *		reachable.  otherwise, round-robin the list.
 *	Ville Nuorvala
 *		Fixed routing subtrees.
 */

27 28
#define pr_fmt(fmt) "IPv6: " fmt

29
#include <linux/capability.h>
L
Linus Torvalds 已提交
30
#include <linux/errno.h>
31
#include <linux/export.h>
L
Linus Torvalds 已提交
32 33 34 35 36 37 38 39
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
40
#include <linux/mroute6.h>
L
Linus Torvalds 已提交
41 42 43 44
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
45
#include <linux/nsproxy.h>
46
#include <linux/slab.h>
47
#include <linux/jhash.h>
48
#include <net/net_namespace.h>
L
Linus Torvalds 已提交
49 50 51 52 53 54 55 56 57
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
58
#include <net/dst_metadata.h>
L
Linus Torvalds 已提交
59
#include <net/xfrm.h>
60
#include <net/netevent.h>
61
#include <net/netlink.h>
62
#include <net/nexthop.h>
63
#include <net/lwtunnel.h>
64
#include <net/ip_tunnels.h>
D
David Ahern 已提交
65
#include <net/l3mdev.h>
66
#include <net/ip.h>
67
#include <linux/uaccess.h>
L
Linus Torvalds 已提交
68 69 70 71 72

#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif

73 74 75 76 77 78 79
static int ip6_rt_type_to_error(u8 fib6_type);

#define CREATE_TRACE_POINTS
#include <trace/events/fib6.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
#undef CREATE_TRACE_POINTS

80
/* Result of the neighbour reachability check used when scoring routes;
 * negative values are failure modes, see rt6_score_route()/find_match().
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD = -3,		/* route unusable, skip it */
	RT6_NUD_FAIL_PROBE = -2,	/* neighbour in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR = -1,	/* try round-robin to a sibling route */
	RT6_NUD_SUCCEED = 1
};

L
Linus Torvalds 已提交
87
static struct dst_entry	*ip6_dst_check(struct dst_entry *dst, u32 cookie);
88
static unsigned int	 ip6_default_advmss(const struct dst_entry *dst);
89
static unsigned int	 ip6_mtu(const struct dst_entry *dst);
L
Linus Torvalds 已提交
90 91 92 93
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void		ip6_dst_destroy(struct dst_entry *);
static void		ip6_dst_ifdown(struct dst_entry *,
				       struct net_device *dev, int how);
94
static int		 ip6_dst_gc(struct dst_ops *ops);
L
Linus Torvalds 已提交
95 96

static int		ip6_pkt_discard(struct sk_buff *skb);
E
Eric W. Biederman 已提交
97
static int		ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
98
static int		ip6_pkt_prohibit(struct sk_buff *skb);
E
Eric W. Biederman 已提交
99
static int		ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
L
Linus Torvalds 已提交
100
static void		ip6_link_failure(struct sk_buff *skb);
101 102 103 104
static void		ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu);
static void		rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
105 106
static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct fib6_info *rt);
107
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
108
			 struct fib6_info *rt, struct dst_entry *dst,
109
			 struct in6_addr *dest, struct in6_addr *src,
110 111
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags);
112
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
113 114
					   struct in6_addr *daddr,
					   struct in6_addr *saddr);
L
Linus Torvalds 已提交
115

116
#ifdef CONFIG_IPV6_ROUTE_INFO
117
static struct fib6_info *rt6_add_route_info(struct net *net,
118
					   const struct in6_addr *prefix, int prefixlen,
119 120
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
121
					   unsigned int pref);
122
static struct fib6_info *rt6_get_route_info(struct net *net,
123
					   const struct in6_addr *prefix, int prefixlen,
124 125
					   const struct in6_addr *gwaddr,
					   struct net_device *dev);
126 127
#endif

128 129 130 131 132 133 134
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);

135
void rt6_uncached_list_add(struct rt6_info *rt)
136 137 138 139 140 141 142 143 144 145
{
	struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt6i_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

146
void rt6_uncached_list_del(struct rt6_info *rt)
147 148 149
{
	if (!list_empty(&rt->rt6i_uncached)) {
		struct uncached_list *ul = rt->rt6i_uncached_list;
W
Wei Wang 已提交
150
		struct net *net = dev_net(rt->dst.dev);
151 152 153

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt6i_uncached);
W
Wei Wang 已提交
154
		atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
155 156 157 158 159 160 161 162 163
		spin_unlock_bh(&ul->lock);
	}
}

static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *loopback_dev = net->loopback_dev;
	int cpu;

164 165 166
	if (dev == loopback_dev)
		return;

167 168 169 170 171 172 173 174 175
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *rt;

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt6i_uncached) {
			struct inet6_dev *rt_idev = rt->rt6i_idev;
			struct net_device *rt_dev = rt->dst.dev;

176
			if (rt_idev->dev == dev) {
177 178 179 180
				rt->rt6i_idev = in6_dev_get(loopback_dev);
				in6_dev_put(rt_idev);
			}

181
			if (rt_dev == dev) {
182 183 184 185 186 187 188 189 190
				rt->dst.dev = loopback_dev;
				dev_hold(rt->dst.dev);
				dev_put(rt_dev);
			}
		}
		spin_unlock_bh(&ul->lock);
	}
}

191
static inline const void *choose_neigh_daddr(const struct in6_addr *p,
192 193
					     struct sk_buff *skb,
					     const void *daddr)
194
{
D
David S. Miller 已提交
195
	if (!ipv6_addr_any(p))
196
		return (const void *) p;
197 198
	else if (skb)
		return &ipv6_hdr(skb)->daddr;
199 200 201
	return daddr;
}

202 203 204 205
struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
				   struct net_device *dev,
				   struct sk_buff *skb,
				   const void *daddr)
206
{
207 208
	struct neighbour *n;

209 210
	daddr = choose_neigh_daddr(gw, skb, daddr);
	n = __ipv6_neigh_lookup(dev, daddr);
211 212
	if (n)
		return n;
213 214 215 216 217 218 219 220 221 222
	return neigh_create(&nd_tbl, daddr, dev);
}

static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
					      struct sk_buff *skb,
					      const void *daddr)
{
	const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);

	return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
223 224
}

225 226 227 228 229
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	struct rt6_info *rt = (struct rt6_info *)dst;

230
	daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
231 232 233 234 235 236 237 238 239
	if (!daddr)
		return;
	if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
		return;
	if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
		return;
	__ipv6_confirm_neigh(dev, daddr);
}

240
static struct dst_ops ip6_dst_ops_template = {
L
Linus Torvalds 已提交
241 242 243 244
	.family			=	AF_INET6,
	.gc			=	ip6_dst_gc,
	.gc_thresh		=	1024,
	.check			=	ip6_dst_check,
245
	.default_advmss		=	ip6_default_advmss,
246
	.mtu			=	ip6_mtu,
247
	.cow_metrics		=	dst_cow_metrics_generic,
L
Linus Torvalds 已提交
248 249 250 251 252
	.destroy		=	ip6_dst_destroy,
	.ifdown			=	ip6_dst_ifdown,
	.negative_advice	=	ip6_negative_advice,
	.link_failure		=	ip6_link_failure,
	.update_pmtu		=	ip6_rt_update_pmtu,
253
	.redirect		=	rt6_do_redirect,
254
	.local_out		=	__ip6_local_out,
255
	.neigh_lookup		=	ip6_dst_neigh_lookup,
256
	.confirm_neigh		=	ip6_confirm_neigh,
L
Linus Torvalds 已提交
257 258
};

259
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
260
{
261 262 263
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
264 265
}

266 267
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					 struct sk_buff *skb, u32 mtu)
268 269 270
{
}

271 272
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
273 274 275
{
}

276 277 278 279
static struct dst_ops ip6_dst_blackhole_ops = {
	.family			=	AF_INET6,
	.destroy		=	ip6_dst_destroy,
	.check			=	ip6_dst_check,
280
	.mtu			=	ip6_blackhole_mtu,
281
	.default_advmss		=	ip6_default_advmss,
282
	.update_pmtu		=	ip6_rt_blackhole_update_pmtu,
283
	.redirect		=	ip6_rt_blackhole_redirect,
284
	.cow_metrics		=	dst_cow_metrics_generic,
285
	.neigh_lookup		=	ip6_dst_neigh_lookup,
286 287
};

288
static const u32 ip6_template_metrics[RTAX_MAX] = {
L
Li RongQing 已提交
289
	[RTAX_HOPLIMIT - 1] = 0,
290 291
};

292
static const struct fib6_info fib6_null_entry_template = {
293 294 295 296
	.fib6_flags	= (RTF_REJECT | RTF_NONEXTHOP),
	.fib6_protocol  = RTPROT_KERNEL,
	.fib6_metric	= ~(u32)0,
	.fib6_ref	= ATOMIC_INIT(1),
D
David Ahern 已提交
297 298 299 300
	.fib6_type	= RTN_UNREACHABLE,
	.fib6_metrics	= (struct dst_metrics *)&dst_default_metrics,
};

301
static const struct rt6_info ip6_null_entry_template = {
302 303 304
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
305
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
306 307 308
		.error		= -ENETUNREACH,
		.input		= ip6_pkt_discard,
		.output		= ip6_pkt_discard_out,
L
Linus Torvalds 已提交
309 310 311 312
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

T
Thomas Graf 已提交
313 314
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

315
static const struct rt6_info ip6_prohibit_entry_template = {
316 317 318
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
319
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
320 321 322
		.error		= -EACCES,
		.input		= ip6_pkt_prohibit,
		.output		= ip6_pkt_prohibit_out,
T
Thomas Graf 已提交
323 324 325 326
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

327
static const struct rt6_info ip6_blk_hole_entry_template = {
328 329 330
	.dst = {
		.__refcnt	= ATOMIC_INIT(1),
		.__use		= 1,
331
		.obsolete	= DST_OBSOLETE_FORCE_CHK,
332 333
		.error		= -EINVAL,
		.input		= dst_discard,
E
Eric W. Biederman 已提交
334
		.output		= dst_discard_out,
T
Thomas Graf 已提交
335 336 337 338 339 340
	},
	.rt6i_flags	= (RTF_REJECT | RTF_NONEXTHOP),
};

#endif

341 342 343 344 345 346 347 348
/* Zero the rt6-specific tail of a freshly allocated rt6_info (everything
 * past the embedded dst_entry, which dst_alloc already set up) and make
 * the uncached-list linkage an empty list so rt6_uncached_list_del() is
 * always safe to call.
 */
static void rt6_info_init(struct rt6_info *rt)
{
	struct dst_entry *dst = &rt->dst;

	memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
	INIT_LIST_HEAD(&rt->rt6i_uncached);
}

L
Linus Torvalds 已提交
349
/* allocate dst with ip6_dst_ops */
350 351
struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
			       int flags)
L
Linus Torvalds 已提交
352
{
353
	struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
W
Wei Wang 已提交
354
					1, DST_OBSOLETE_FORCE_CHK, flags);
355

W
Wei Wang 已提交
356
	if (rt) {
357
		rt6_info_init(rt);
W
Wei Wang 已提交
358 359
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
	}
360

361
	return rt;
L
Linus Torvalds 已提交
362
}
363
EXPORT_SYMBOL(ip6_dst_alloc);
M
Martin KaFai Lau 已提交
364

L
Linus Torvalds 已提交
365 366
static void ip6_dst_destroy(struct dst_entry *dst)
{
367
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
L
Linus Torvalds 已提交
368
	struct rt6_info *rt = (struct rt6_info *)dst;
369
	struct fib6_info *from;
370
	struct inet6_dev *idev;
L
Linus Torvalds 已提交
371

372 373 374
	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

375 376 377
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
378
	if (idev) {
L
Linus Torvalds 已提交
379 380
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
381
	}
382

383 384 385
	rcu_read_lock();
	from = rcu_dereference(rt->from);
	rcu_assign_pointer(rt->from, NULL);
386
	fib6_info_release(from);
387
	rcu_read_unlock();
388 389
}

L
Linus Torvalds 已提交
390 391 392 393 394
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *idev = rt->rt6i_idev;
395
	struct net_device *loopback_dev =
396
		dev_net(dev)->loopback_dev;
L
Linus Torvalds 已提交
397

398 399 400 401 402
	if (idev && idev->dev != loopback_dev) {
		struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
		if (loopback_idev) {
			rt->rt6i_idev = loopback_idev;
			in6_dev_put(idev);
403
		}
L
Linus Torvalds 已提交
404 405 406
	}
}

407 408 409 410 411 412 413 414
/* True when the cached route itself carries an expiry (RTF_EXPIRES)
 * and that deadline has passed; routes without RTF_EXPIRES never
 * expire here.
 */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);
	else
		return false;
}

415
static bool rt6_check_expired(const struct rt6_info *rt)
L
Linus Torvalds 已提交
416
{
417 418 419 420
	struct fib6_info *from;

	from = rcu_dereference(rt->from);

421 422
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires))
423
			return true;
424
	} else if (from) {
425
		return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
426
			fib6_check_expired(from);
427
	}
428
	return false;
L
Linus Torvalds 已提交
429 430
}

431 432 433 434 435
struct fib6_info *fib6_multipath_select(const struct net *net,
					struct fib6_info *match,
					struct flowi6 *fl6, int oif,
					const struct sk_buff *skb,
					int strict)
436
{
437
	struct fib6_info *sibling, *next_sibling;
438

439 440 441 442
	/* We might have already computed the hash for ICMPv6 errors. In such
	 * case it will always be non-zero. Otherwise now is the time to do it.
	 */
	if (!fl6->mp_hash)
443
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
444

445
	if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
446 447
		return match;

448 449
	list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
				 fib6_siblings) {
450 451 452 453
		int nh_upper_bound;

		nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
		if (fl6->mp_hash > nh_upper_bound)
454 455 456 457 458 459 460
			continue;
		if (rt6_score_route(sibling, oif, strict) < 0)
			break;
		match = sibling;
		break;
	}

461 462 463
	return match;
}

L
Linus Torvalds 已提交
464
/*
465
 *	Route lookup. rcu_read_lock() should be held.
L
Linus Torvalds 已提交
466 467
 */

468 469
static inline struct fib6_info *rt6_device_match(struct net *net,
						 struct fib6_info *rt,
470
						    const struct in6_addr *saddr,
L
Linus Torvalds 已提交
471
						    int oif,
472
						    int flags)
L
Linus Torvalds 已提交
473
{
474
	struct fib6_info *sprt;
L
Linus Torvalds 已提交
475

476 477
	if (!oif && ipv6_addr_any(saddr) &&
	    !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
478
		return rt;
479

480
	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
481
		const struct net_device *dev = sprt->fib6_nh.nh_dev;
482

483
		if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
484 485
			continue;

486
		if (oif) {
L
Linus Torvalds 已提交
487 488
			if (dev->ifindex == oif)
				return sprt;
489 490 491 492
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
L
Linus Torvalds 已提交
493
		}
494
	}
L
Linus Torvalds 已提交
495

496 497
	if (oif && flags & RT6_LOOKUP_F_IFACE)
		return net->ipv6.fib6_null_entry;
498

D
David Ahern 已提交
499
	return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
L
Linus Torvalds 已提交
500 501
}

502
#ifdef CONFIG_IPV6_ROUTER_PREF
struct __rt6_probe_work {
	struct work_struct work;
	struct in6_addr target;
	struct net_device *dev;
};

/* Workqueue callback: emit the neighbour-solicit probe for the router,
 * then drop the device reference taken by rt6_probe().
 */
static void rt6_probe_deferred(struct work_struct *w)
{
	struct __rt6_probe_work *work =
		container_of(w, struct __rt6_probe_work, work);
	struct in6_addr mcaddr;

	addrconf_addr_solict_mult(&work->target, &mcaddr);
	ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
	dev_put(work->dev);
	kfree(work);
}

static void rt6_probe(struct fib6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	const struct in6_addr *nh_gw;
	struct neighbour *neigh;
	struct net_device *dev;
	struct inet6_dev *idev;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
		return;

	nh_gw = &rt->fib6_nh.nh_gw;
	dev = rt->fib6_nh.nh_dev;
	rcu_read_lock_bh();
	idev = __in6_dev_get(dev);
	neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
	if (neigh) {
		if (neigh->nud_state & NUD_VALID)
			goto out;

		write_lock(&neigh->lock);
		/* re-check under the neighbour lock and rate-limit via
		 * neigh->updated before scheduling a probe
		 */
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated + idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	} else if (time_after(jiffies, rt->last_probe +
				       idev->cnf.rtr_probe_interval)) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	}

	if (work) {
		rt->last_probe = jiffies;
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = *nh_gw;
		dev_hold(dev);
		work->dev = dev;
		schedule_work(&work->work);
	}

out:
	rcu_read_unlock_bh();
}
#else
static inline void rt6_probe(struct fib6_info *rt)
{
}
#endif

L
Linus Torvalds 已提交
581
/*
582
 * Default Router Selection (RFC 2461 6.3.6)
L
Linus Torvalds 已提交
583
 */
584
static inline int rt6_check_dev(struct fib6_info *rt, int oif)
585
{
586 587
	const struct net_device *dev = rt->fib6_nh.nh_dev;

588
	if (!oif || dev->ifindex == oif)
589
		return 2;
590
	return 0;
591
}
L
Linus Torvalds 已提交
592

593
static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
L
Linus Torvalds 已提交
594
{
595
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
596
	struct neighbour *neigh;
597

598 599
	if (rt->fib6_flags & RTF_NONEXTHOP ||
	    !(rt->fib6_flags & RTF_GATEWAY))
600
		return RT6_NUD_SUCCEED;
601 602

	rcu_read_lock_bh();
603 604
	neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
					  &rt->fib6_nh.nh_gw);
605 606
	if (neigh) {
		read_lock(&neigh->lock);
607
		if (neigh->nud_state & NUD_VALID)
608
			ret = RT6_NUD_SUCCEED;
609
#ifdef CONFIG_IPV6_ROUTER_PREF
610
		else if (!(neigh->nud_state & NUD_FAILED))
611
			ret = RT6_NUD_SUCCEED;
J
Jiri Benc 已提交
612 613
		else
			ret = RT6_NUD_FAIL_PROBE;
614
#endif
615
		read_unlock(&neigh->lock);
616 617
	} else {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
J
Jiri Benc 已提交
618
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
619
	}
620 621
	rcu_read_unlock_bh();

622
	return ret;
L
Linus Torvalds 已提交
623 624
}

625
static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
L
Linus Torvalds 已提交
626
{
627
	int m;
628

629
	m = rt6_check_dev(rt, oif);
630
	if (!m && (strict & RT6_LOOKUP_F_IFACE))
631
		return RT6_NUD_FAIL_HARD;
632
#ifdef CONFIG_IPV6_ROUTER_PREF
633
	m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
634
#endif
635 636 637 638 639
	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int n = rt6_check_neigh(rt);
		if (n < 0)
			return n;
	}
640 641 642
	return m;
}

D
David Ahern 已提交
643 644 645 646 647 648 649 650 651 652 653 654 655 656 657
/* called with rcu_read_lock held */
static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
{
	const struct net_device *dev = fib6_info_nh_dev(f6i);
	bool rc = false;

	if (dev) {
		const struct inet6_dev *idev = __in6_dev_get(dev);

		/* per-device sysctl: skip routes whose nexthop link is down */
		rc = !!idev->cnf.ignore_routes_with_linkdown;
	}

	return rc;
}

658 659
static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
				   int *mpri, struct fib6_info *match,
660
				   bool *do_rr)
661
{
662
	int m;
663
	bool match_do_rr = false;
664

665
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
666 667
		goto out;

D
David Ahern 已提交
668
	if (fib6_ignore_linkdown(rt) &&
669
	    rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
670
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
671
		goto out;
672

673
	if (fib6_check_expired(rt))
674 675 676
		goto out;

	m = rt6_score_route(rt, oif, strict);
J
Jiri Benc 已提交
677
	if (m == RT6_NUD_FAIL_DO_RR) {
678 679
		match_do_rr = true;
		m = 0; /* lowest valid score */
J
Jiri Benc 已提交
680
	} else if (m == RT6_NUD_FAIL_HARD) {
681
		goto out;
682 683 684 685
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);
686

J
Jiri Benc 已提交
687
	/* note that m can be RT6_NUD_FAIL_PROBE at this point */
688
	if (m > *mpri) {
689
		*do_rr = match_do_rr;
690 691 692 693 694 695 696
		*mpri = m;
		match = rt;
	}
out:
	return match;
}

697 698 699
static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
				     struct fib6_info *leaf,
				     struct fib6_info *rr_head,
700 701
				     u32 metric, int oif, int strict,
				     bool *do_rr)
702
{
703
	struct fib6_info *rt, *match, *cont;
704
	int mpri = -1;
L
Linus Torvalds 已提交
705

706
	match = NULL;
707
	cont = NULL;
708
	for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
709
		if (rt->fib6_metric != metric) {
710 711 712 713 714 715 716
			cont = rt;
			break;
		}

		match = find_match(rt, oif, strict, &mpri, match, do_rr);
	}

717
	for (rt = leaf; rt && rt != rr_head;
718
	     rt = rcu_dereference(rt->fib6_next)) {
719
		if (rt->fib6_metric != metric) {
720 721 722 723
			cont = rt;
			break;
		}

724
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
725 726 727 728 729
	}

	if (match || !cont)
		return match;

730
	for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
731
		match = find_match(rt, oif, strict, &mpri, match, do_rr);
L
Linus Torvalds 已提交
732

733 734
	return match;
}
L
Linus Torvalds 已提交
735

736
static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
W
Wei Wang 已提交
737
				   int oif, int strict)
738
{
739 740
	struct fib6_info *leaf = rcu_dereference(fn->leaf);
	struct fib6_info *match, *rt0;
741
	bool do_rr = false;
742
	int key_plen;
L
Linus Torvalds 已提交
743

D
David Ahern 已提交
744 745
	if (!leaf || leaf == net->ipv6.fib6_null_entry)
		return net->ipv6.fib6_null_entry;
W
Wei Wang 已提交
746

747
	rt0 = rcu_dereference(fn->rr_ptr);
748
	if (!rt0)
749
		rt0 = leaf;
L
Linus Torvalds 已提交
750

751 752 753 754 755
	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not points to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
756
	key_plen = rt0->fib6_dst.plen;
757
#ifdef CONFIG_IPV6_SUBTREES
758 759
	if (rt0->fib6_src.plen)
		key_plen = rt0->fib6_src.plen;
760 761
#endif
	if (fn->fn_bit != key_plen)
D
David Ahern 已提交
762
		return net->ipv6.fib6_null_entry;
763

764
	match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
765
			     &do_rr);
L
Linus Torvalds 已提交
766

767
	if (do_rr) {
768
		struct fib6_info *next = rcu_dereference(rt0->fib6_next);
769

770
		/* no entries matched; do round-robin */
771
		if (!next || next->fib6_metric != rt0->fib6_metric)
W
Wei Wang 已提交
772
			next = leaf;
773

774
		if (next != rt0) {
775
			spin_lock_bh(&leaf->fib6_table->tb6_lock);
776
			/* make sure next is not being deleted from the tree */
777
			if (next->fib6_node)
778
				rcu_assign_pointer(fn->rr_ptr, next);
779
			spin_unlock_bh(&leaf->fib6_table->tb6_lock);
780
		}
L
Linus Torvalds 已提交
781 782
	}

D
David Ahern 已提交
783
	return match ? match : net->ipv6.fib6_null_entry;
L
Linus Torvalds 已提交
784 785
}

786
static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
787
{
788
	return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
789 790
}

791 792
#ifdef CONFIG_IPV6_ROUTE_INFO
/* Process a Route Information Option from a Router Advertisement:
 * validate it, then add, refresh or delete the corresponding
 * RTF_ROUTEINFO route according to its lifetime and preference.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct fib6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3)
		return -EINVAL;
	if (rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(net, gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* zero lifetime withdraws an existing route */
	if (rt && !lifetime) {
		ip6_del_rt(net, rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
					dev, pref);
	else if (rt)
		rt->fib6_flags = RTF_ROUTEINFO |
				 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime))
			fib6_clean_expires(rt);
		else
			fib6_set_expires(rt, jiffies + HZ * lifetime);

		fib6_info_release(rt);
	}
	return 0;
}
#endif

867 868 869 870 871
/*
 *	Misc support functions
 */

/* called with rcu_lock held */
872
static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
873
{
874
	struct net_device *dev = rt->fib6_nh.nh_dev;
875

876
	if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
877 878 879 880 881
		/* for copies of local routes, dst->dev needs to be the
		 * device if it is a master device, the master device if
		 * device is enslaved, and the loopback as the default
		 */
		if (netif_is_l3_slave(dev) &&
882
		    !rt6_need_strict(&rt->fib6_dst.addr))
883 884 885 886 887 888 889 890 891 892 893
			dev = l3mdev_master_dev_rcu(dev);
		else if (!netif_is_l3_master(dev))
			dev = dev_net(dev)->loopback_dev;
		/* last case is netif_is_l3_master(dev) is true in which
		 * case we want dev returned to be dev
		 */
	}

	return dev;
}

894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913
/* Map an RTN_* route type to the error reported for traffic hitting it;
 * 0 means the type forwards/delivers rather than rejecting.
 */
static const int fib6_prop[RTN_MAX + 1] = {
	[RTN_UNSPEC]	= 0,
	[RTN_UNICAST]	= 0,
	[RTN_LOCAL]	= 0,
	[RTN_BROADCAST]	= 0,
	[RTN_ANYCAST]	= 0,
	[RTN_MULTICAST]	= 0,
	[RTN_BLACKHOLE]	= -EINVAL,
	[RTN_UNREACHABLE] = -EHOSTUNREACH,
	[RTN_PROHIBIT]	= -EACCES,
	[RTN_THROW]	= -EAGAIN,
	[RTN_NAT]	= -EINVAL,
	[RTN_XRESOLVE]	= -EINVAL,
};

static int ip6_rt_type_to_error(u8 fib6_type)
{
	return fib6_prop[fib6_type];
}

914
static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
915 916 917 918 919 920 921 922 923 924 925 926 927
{
	unsigned short flags = 0;

	if (rt->dst_nocount)
		flags |= DST_NOCOUNT;
	if (rt->dst_nopolicy)
		flags |= DST_NOPOLICY;
	if (rt->dst_host)
		flags |= DST_HOST;

	return flags;
}

928
static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949
{
	rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);

	switch (ort->fib6_type) {
	case RTN_BLACKHOLE:
		rt->dst.output = dst_discard_out;
		rt->dst.input = dst_discard;
		break;
	case RTN_PROHIBIT:
		rt->dst.output = ip6_pkt_prohibit_out;
		rt->dst.input = ip6_pkt_prohibit;
		break;
	case RTN_THROW:
	case RTN_UNREACHABLE:
	default:
		rt->dst.output = ip6_pkt_discard_out;
		rt->dst.input = ip6_pkt_discard;
		break;
	}
}

950
static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
951
{
952
	if (ort->fib6_flags & RTF_REJECT) {
953 954 955 956 957 958 959
		ip6_rt_init_dst_reject(rt, ort);
		return;
	}

	rt->dst.error = 0;
	rt->dst.output = ip6_output;

960
	if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
961
		rt->dst.input = ip6_input;
962
	} else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
963 964 965 966 967 968 969 970 971 972 973 974 975
		rt->dst.input = ip6_mc_input;
	} else {
		rt->dst.input = ip6_forward;
	}

	if (ort->fib6_nh.nh_lwtstate) {
		rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
		lwtunnel_set_redirect(&rt->dst);
	}

	rt->dst.lastuse = jiffies;
}

976
/* Caller must already hold reference to @from */
977
static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
978 979
{
	rt->rt6i_flags &= ~RTF_EXPIRES;
980
	rcu_assign_pointer(rt->from, from);
981
	dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
982 983 984 985
	if (from->fib6_metrics != &dst_default_metrics) {
		rt->dst._metrics |= DST_METRICS_REFCOUNTED;
		refcount_inc(&from->fib6_metrics->refcnt);
	}
986 987
}

988
/* Caller must already hold reference to @ort */
989
static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
990
{
D
David Ahern 已提交
991 992
	struct net_device *dev = fib6_info_nh_dev(ort);

993 994
	ip6_rt_init_dst(rt, ort);

995
	rt->rt6i_dst = ort->fib6_dst;
D
David Ahern 已提交
996
	rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
997
	rt->rt6i_gateway = ort->fib6_nh.nh_gw;
998
	rt->rt6i_flags = ort->fib6_flags;
999 1000
	rt6_set_from(rt, ort);
#ifdef CONFIG_IPV6_SUBTREES
1001
	rt->rt6i_src = ort->fib6_src;
1002
#endif
1003
	rt->rt6i_prefsrc = ort->fib6_prefsrc;
1004 1005
}

M
Martin KaFai Lau 已提交
1006 1007 1008
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
1009
	struct fib6_node *pn, *sn;
M
Martin KaFai Lau 已提交
1010 1011 1012
	while (1) {
		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;
1013 1014 1015
		pn = rcu_dereference(fn->parent);
		sn = FIB6_SUBTREE(pn);
		if (sn && sn != fn)
1016
			fn = fib6_node_lookup(sn, NULL, saddr);
M
Martin KaFai Lau 已提交
1017 1018 1019 1020 1021 1022
		else
			fn = pn;
		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
T
Thomas Graf 已提交
1023

1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040
/* Try to take a reference on *prt. Returns true on success. On failure,
 * *prt is replaced by the (held) null entry when @null_fallback is set,
 * or by NULL otherwise.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *rt = *prt;

	if (dst_hold_safe(&rt->dst))
		return true;
	if (null_fallback) {
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
	} else {
		rt = NULL;
	}
	*prt = rt;
	return false;
}

1041
/* called with rcu_lock held */
1042
static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1043
{
1044
	unsigned short flags = fib6_info_dst_flags(rt);
1045 1046 1047
	struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rt6_info *nrt;

1048 1049 1050
	if (!fib6_info_hold_safe(rt))
		return NULL;

1051
	nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1052 1053
	if (nrt)
		ip6_rt_copy_init(nrt, rt);
1054 1055
	else
		fib6_info_release(rt);
1056 1057 1058 1059

	return nrt;
}

1060 1061
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
D
David Ahern 已提交
1062 1063 1064
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
L
Linus Torvalds 已提交
1065
{
1066
	struct fib6_info *f6i;
L
Linus Torvalds 已提交
1067
	struct fib6_node *fn;
1068
	struct rt6_info *rt;
L
Linus Torvalds 已提交
1069

1070 1071 1072
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

1073
	rcu_read_lock();
1074
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
T
Thomas Graf 已提交
1075
restart:
1076 1077 1078
	f6i = rcu_dereference(fn->leaf);
	if (!f6i) {
		f6i = net->ipv6.fib6_null_entry;
1079
	} else {
1080
		f6i = rt6_device_match(net, f6i, &fl6->saddr,
1081
				      fl6->flowi6_oif, flags);
1082
		if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1083 1084 1085
			f6i = fib6_multipath_select(net, f6i, fl6,
						    fl6->flowi6_oif, skb,
						    flags);
1086
	}
1087
	if (f6i == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
1088 1089 1090 1091
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
	}
1092

1093
	trace_fib6_table_lookup(net, f6i, table, fl6);
1094

1095
	/* Search through exception table */
1096 1097
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
1098 1099
		if (ip6_hold_safe(net, &rt, true))
			dst_use_noref(&rt->dst, jiffies);
1100
	} else if (f6i == net->ipv6.fib6_null_entry) {
1101 1102
		rt = net->ipv6.ip6_null_entry;
		dst_hold(&rt->dst);
1103 1104 1105 1106 1107 1108
	} else {
		rt = ip6_create_rt_rcu(f6i);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
			dst_hold(&rt->dst);
		}
1109
	}
D
David Ahern 已提交
1110

1111
	rcu_read_unlock();
D
David Ahern 已提交
1112

T
Thomas Graf 已提交
1113 1114 1115
	return rt;
}

1116
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
D
David Ahern 已提交
1117
				   const struct sk_buff *skb, int flags)
F
Florian Westphal 已提交
1118
{
D
David Ahern 已提交
1119
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
F
Florian Westphal 已提交
1120 1121 1122
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);

1123
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
D
David Ahern 已提交
1124 1125
			    const struct in6_addr *saddr, int oif,
			    const struct sk_buff *skb, int strict)
T
Thomas Graf 已提交
1126
{
1127 1128 1129
	struct flowi6 fl6 = {
		.flowi6_oif = oif,
		.daddr = *daddr,
T
Thomas Graf 已提交
1130 1131
	};
	struct dst_entry *dst;
1132
	int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
T
Thomas Graf 已提交
1133

1134
	if (saddr) {
1135
		memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1136 1137 1138
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	}

D
David Ahern 已提交
1139
	dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
T
Thomas Graf 已提交
1140 1141 1142 1143 1144
	if (dst->error == 0)
		return (struct rt6_info *) dst;

	dst_release(dst);

L
Linus Torvalds 已提交
1145 1146
	return NULL;
}
1147 1148
EXPORT_SYMBOL(rt6_lookup);

T
Thomas Graf 已提交
1149
/* ip6_ins_rt is called with FREE table->tb6_lock.
1150 1151 1152
 * It takes new route entry, the addition fails by any reason the
 * route is released.
 * Caller must hold dst before calling it.
L
Linus Torvalds 已提交
1153 1154
 */

1155
static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1156
			struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
1157 1158
{
	int err;
T
Thomas Graf 已提交
1159
	struct fib6_table *table;
L
Linus Torvalds 已提交
1160

1161
	table = rt->fib6_table;
1162
	spin_lock_bh(&table->tb6_lock);
1163
	err = fib6_add(&table->tb6_root, rt, info, extack);
1164
	spin_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
1165 1166 1167 1168

	return err;
}

1169
int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1170
{
1171
	struct nl_info info = {	.nl_net = net, };
1172

1173
	return __ip6_ins_rt(rt, &info, NULL);
1174 1175
}

1176
static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1177 1178
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
L
Linus Torvalds 已提交
1179
{
1180
	struct net_device *dev;
L
Linus Torvalds 已提交
1181 1182 1183 1184 1185 1186
	struct rt6_info *rt;

	/*
	 *	Clone the route.
	 */

1187 1188 1189
	if (!fib6_info_hold_safe(ort))
		return NULL;

1190
	dev = ip6_rt_get_dev_rcu(ort);
1191
	rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1192 1193
	if (!rt) {
		fib6_info_release(ort);
M
Martin KaFai Lau 已提交
1194
		return NULL;
1195
	}
M
Martin KaFai Lau 已提交
1196 1197 1198 1199 1200 1201

	ip6_rt_copy_init(rt, ort);
	rt->rt6i_flags |= RTF_CACHE;
	rt->dst.flags |= DST_HOST;
	rt->rt6i_dst.addr = *daddr;
	rt->rt6i_dst.plen = 128;
L
Linus Torvalds 已提交
1202

M
Martin KaFai Lau 已提交
1203
	if (!rt6_is_gw_or_nonexthop(ort)) {
1204 1205
		if (ort->fib6_dst.plen != 128 &&
		    ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
M
Martin KaFai Lau 已提交
1206
			rt->rt6i_flags |= RTF_ANYCAST;
L
Linus Torvalds 已提交
1207
#ifdef CONFIG_IPV6_SUBTREES
M
Martin KaFai Lau 已提交
1208 1209 1210
		if (rt->rt6i_src.plen && saddr) {
			rt->rt6i_src.addr = *saddr;
			rt->rt6i_src.plen = 128;
1211
		}
M
Martin KaFai Lau 已提交
1212
#endif
1213
	}
L
Linus Torvalds 已提交
1214

1215 1216
	return rt;
}
L
Linus Torvalds 已提交
1217

1218
static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
M
Martin KaFai Lau 已提交
1219
{
1220
	unsigned short flags = fib6_info_dst_flags(rt);
1221
	struct net_device *dev;
M
Martin KaFai Lau 已提交
1222 1223
	struct rt6_info *pcpu_rt;

1224 1225 1226
	if (!fib6_info_hold_safe(rt))
		return NULL;

1227 1228
	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
1229
	pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1230
	rcu_read_unlock();
1231 1232
	if (!pcpu_rt) {
		fib6_info_release(rt);
M
Martin KaFai Lau 已提交
1233
		return NULL;
1234
	}
M
Martin KaFai Lau 已提交
1235 1236 1237 1238 1239
	ip6_rt_copy_init(pcpu_rt, rt);
	pcpu_rt->rt6i_flags |= RTF_PCPU;
	return pcpu_rt;
}

1240
/* It should be called with rcu_read_lock() acquired */
1241
static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
M
Martin KaFai Lau 已提交
1242
{
1243
	struct rt6_info *pcpu_rt, **p;
M
Martin KaFai Lau 已提交
1244 1245 1246 1247

	p = this_cpu_ptr(rt->rt6i_pcpu);
	pcpu_rt = *p;

1248 1249
	if (pcpu_rt)
		ip6_hold_safe(NULL, &pcpu_rt, false);
1250

1251 1252 1253
	return pcpu_rt;
}

1254
static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1255
					    struct fib6_info *rt)
1256 1257
{
	struct rt6_info *pcpu_rt, *prev, **p;
M
Martin KaFai Lau 已提交
1258 1259 1260

	pcpu_rt = ip6_rt_pcpu_alloc(rt);
	if (!pcpu_rt) {
1261 1262
		dst_hold(&net->ipv6.ip6_null_entry->dst);
		return net->ipv6.ip6_null_entry;
M
Martin KaFai Lau 已提交
1263 1264
	}

1265 1266 1267
	dst_hold(&pcpu_rt->dst);
	p = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(p, NULL, pcpu_rt);
1268
	BUG_ON(prev);
1269

M
Martin KaFai Lau 已提交
1270 1271 1272
	return pcpu_rt;
}

1273 1274 1275 1276 1277 1278 1279 1280 1281 1282
/* exception hash table implementation
 */
static DEFINE_SPINLOCK(rt6_exception_lock);

/* Remove rt6_ex from hash table and free the memory
 * Caller must hold rt6_exception_lock
 */
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
				 struct rt6_exception *rt6_ex)
{
1283
	struct net *net;
W
Wei Wang 已提交
1284

1285 1286
	if (!bucket || !rt6_ex)
		return;
1287 1288

	net = dev_net(rt6_ex->rt6i->dst.dev);
1289
	hlist_del_rcu(&rt6_ex->hlist);
1290
	dst_release(&rt6_ex->rt6i->dst);
1291 1292 1293
	kfree_rcu(rt6_ex, rcu);
	WARN_ON_ONCE(!bucket->depth);
	bucket->depth--;
W
Wei Wang 已提交
1294
	net->ipv6.rt6_stats->fib_rt_cache--;
1295 1296 1297 1298 1299 1300 1301 1302 1303 1304 1305 1306 1307 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 1323 1324 1325 1326 1327 1328 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397
}

/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 */
/* Evict the least-recently-stamped exception entry from @bucket.
 * Caller must hold rt6_exception_lock.
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *oldest = NULL;
	struct rt6_exception *entry;

	if (!bucket)
		return;

	hlist_for_each_entry(entry, &bucket->chain, hlist) {
		if (!oldest || time_before(entry->stamp, oldest->stamp))
			oldest = entry;
	}
	rt6_remove_exception(bucket, oldest);
}

/* Hash a (dst, src) address pair into an exception-bucket index in
 * [0, FIB6_EXCEPTION_BUCKET_SIZE).  The source address only takes part
 * when source-routed subtrees are configured; the per-boot random seed
 * makes bucket placement unpredictable to remote parties.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 hash;

	net_get_random_once(&seed, sizeof(seed));

	hash = jhash(dst, sizeof(*dst), seed);
#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		hash = jhash(src, sizeof(*src), hash);
#endif

	return hash_32(hash, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 */
/* Look up the cached exception route for (daddr, saddr), advancing
 * *bucket to the hash chain that pair maps to as a side effect.
 * Returns the matching entry or NULL.  Caller must hold
 * rt6_exception_lock.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;

	if (!(*bucket) || !daddr)
		return NULL;

	*bucket += rt6_exception_hash(daddr, saddr);

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;

		if (!ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr))
			continue;
#ifdef CONFIG_IPV6_SUBTREES
		/* Source address only disambiguates subtree routes */
		if (saddr && !ipv6_addr_equal(saddr, &rt6->rt6i_src.addr))
			continue;
#endif
		return rt6_ex;
	}
	return NULL;
}

/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	/* Lockless twin of __rt6_find_exception_spinlock(); readers must
	 * be inside an RCU read-side critical section.
	 */
	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	/* Advance *bucket to the chain this (daddr, saddr) pair hashes to */
	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *rt6 = rt6_ex->rt6i;
		bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);

#ifdef CONFIG_IPV6_SUBTREES
		/* Source address only disambiguates subtree routes */
		if (matched && saddr)
			matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
#endif
		if (matched)
			return rt6_ex;
	}
	return NULL;
}

1398
static unsigned int fib6_mtu(const struct fib6_info *rt)
1399 1400 1401
{
	unsigned int mtu;

D
David Ahern 已提交
1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413
	if (rt->fib6_pmtu) {
		mtu = rt->fib6_pmtu;
	} else {
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev;

		rcu_read_lock();
		idev = __in6_dev_get(dev);
		mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

1414 1415 1416 1417 1418
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
}

1419
static int rt6_insert_exception(struct rt6_info *nrt,
1420
				struct fib6_info *ort)
1421
{
1422
	struct net *net = dev_net(nrt->dst.dev);
1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	int err = 0;

	spin_lock_bh(&rt6_exception_lock);

	if (ort->exception_bucket_flushed) {
		err = -EINVAL;
		goto out;
	}

	bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));
	if (!bucket) {
		bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
				 GFP_ATOMIC);
		if (!bucket) {
			err = -ENOMEM;
			goto out;
		}
		rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
	}

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates ort is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1454
	if (ort->fib6_src.plen)
1455 1456
		src_key = &nrt->rt6i_src.addr;
#endif
1457 1458 1459 1460

	/* Update rt6i_prefsrc as it could be changed
	 * in rt6_remove_prefsrc()
	 */
1461
	nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1462 1463 1464 1465
	/* rt6_mtu_change() might lower mtu on ort.
	 * Only insert this exception route if its mtu
	 * is less than ort's mtu value.
	 */
1466
	if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1467 1468 1469
		err = -EINVAL;
		goto out;
	}
1470

1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484
	rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex)
		rt6_remove_exception(bucket, rt6_ex);

	rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
	if (!rt6_ex) {
		err = -ENOMEM;
		goto out;
	}
	rt6_ex->rt6i = nrt;
	rt6_ex->stamp = jiffies;
	hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
	bucket->depth++;
W
Wei Wang 已提交
1485
	net->ipv6.rt6_stats->fib_rt_cache++;
1486 1487 1488 1489 1490 1491 1492 1493

	if (bucket->depth > FIB6_MAX_DEPTH)
		rt6_exception_remove_oldest(bucket);

out:
	spin_unlock_bh(&rt6_exception_lock);

	/* Update fn->fn_sernum to invalidate all cached dst */
1494
	if (!err) {
1495
		spin_lock_bh(&ort->fib6_table->tb6_lock);
1496
		fib6_update_sernum(net, ort);
1497
		spin_unlock_bh(&ort->fib6_table->tb6_lock);
1498 1499
		fib6_force_start_gc(net);
	}
1500 1501 1502 1503

	return err;
}

1504
void rt6_flush_exceptions(struct fib6_info *rt)
1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533
{
	/* Drop every cached exception route hanging off @rt and mark the
	 * fib6_info so rt6_insert_exception() can never repopulate it.
	 */
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	spin_lock_bh(&rt6_exception_lock);
	/* Prevent rt6_insert_exception() to recreate the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
	if (!bucket)
		goto out;

	/* Empty every hash chain; each chain should end up with depth 0 */
	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
			rt6_remove_exception(bucket, rt6_ex);
		WARN_ON_ONCE(bucket->depth);
		bucket++;
	}

out:
	spin_unlock_bh(&rt6_exception_lock);
}

/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 */
1534
static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
	struct rt6_info *res = NULL;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1552
	if (rt->fib6_src.plen)
1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);

	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		res = rt6_ex->rt6i;

	return res;
}

/* Remove the passed in cached rt from the hash table that contains it */
1564
static int rt6_remove_exception_rt(struct rt6_info *rt)
1565 1566 1567 1568
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;
1569
	struct fib6_info *from;
1570 1571
	int err;

1572
	from = rcu_dereference(rt->from);
1573
	if (!from ||
1574
	    !(rt->rt6i_flags & RTF_CACHE))
1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589
		return -EINVAL;

	if (!rcu_access_pointer(from->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1590
	if (from->fib6_src.plen)
1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_spinlock(&bucket,
					       &rt->rt6i_dst.addr,
					       src_key);
	if (rt6_ex) {
		rt6_remove_exception(bucket, rt6_ex);
		err = 0;
	} else {
		err = -ENOENT;
	}

	spin_unlock_bh(&rt6_exception_lock);
	return err;
}

/* Find rt6_ex which contains the passed in rt cache and
 * refresh its stamp
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
1613
	struct fib6_info *from = rt->from;
1614 1615 1616 1617
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	if (!from ||
1618
	    !(rt->rt6i_flags & RTF_CACHE))
1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630
		return;

	rcu_read_lock();
	bucket = rcu_dereference(from->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
1631
	if (from->fib6_src.plen)
1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642
		src_key = &rt->rt6i_src.addr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket,
					  &rt->rt6i_dst.addr,
					  src_key);
	if (rt6_ex)
		rt6_ex->stamp = jiffies;

	rcu_read_unlock();
}

1643
static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 1654 1655 1656 1657 1658 1659 1660 1661
{
	/* Clear the preferred-source address (plen = 0) on every cached
	 * exception route of @rt.  Caller must hold rt6_exception_lock
	 * (see the rcu_dereference_protected() below).
	 */
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
				rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
			}
			bucket++;
		}
	}
}

1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684
/* Decide whether a route's PMTU may be updated to @mtu.
 *
 * A decrease is always allowed: the new, lower MTU becomes the lowest
 * MTU on the path.  An increase is allowed only when the route's PMTU
 * equals the local link MTU (idev->cnf.mtu6), i.e. this hop was the
 * path bottleneck; if other nodes now have lower MTUs, regular PMTU
 * discovery will rediscover them.
 */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	unsigned int route_mtu = dst_mtu(&rt->dst);

	return route_mtu >= mtu || route_mtu == idev->cnf.mtu6;
}

static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1685
				       struct fib6_info *rt, int mtu)
1686 1687 1688 1689 1690 1691 1692 1693
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	int i;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
					lockdep_is_held(&rt6_exception_lock));

1694 1695 1696 1697 1698 1699 1700 1701
	if (!bucket)
		return;

	for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
		hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
			struct rt6_info *entry = rt6_ex->rt6i;

			/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1702
			 * route), the metrics of its rt->from have already
1703 1704
			 * been updated.
			 */
1705
			if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1706
			    rt6_mtu_change_route_allowed(idev, entry, mtu))
1707
				dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1708
		}
1709
		bucket++;
1710 1711 1712
	}
}

1713 1714
#define RTF_CACHE_GATEWAY	(RTF_GATEWAY | RTF_CACHE)

1715
static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1716 1717 1718 1719 1720 1721 1722 1723 1724 1725 1726 1727 1728 1729 1730 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749
					struct in6_addr *gateway)
{
	/* Remove every cached exception route of @rt that is both a
	 * gateway and a cache entry (RTF_CACHE_GATEWAY) and whose gateway
	 * address equals @gateway.
	 */
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	/* Cheap lockless bail-out when @rt has no exception table at all */
	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				     lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				struct rt6_info *entry = rt6_ex->rt6i;

				if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
				    RTF_CACHE_GATEWAY &&
				    ipv6_addr_equal(gateway,
						    &entry->rt6i_gateway)) {
					rt6_remove_exception(bucket, rt6_ex);
				}
			}
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}

1750 1751 1752 1753 1754 1755 1756
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

1757 1758 1759 1760 1761 1762
	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
W
Wei Wang 已提交
1763 1764 1765 1766 1767 1768 1769 1770
	if (!(rt->rt6i_flags & RTF_EXPIRES)) {
		if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
			RT6_TRACE("aging clone %p\n", rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	} else if (time_after(jiffies, rt->dst.expires)) {
		RT6_TRACE("purging expired route %p\n", rt);
1771 1772
		rt6_remove_exception(bucket, rt6_ex);
		return;
W
Wei Wang 已提交
1773 1774 1775
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
1776 1777 1778
		struct neighbour *neigh;
		__u8 neigh_flags = 0;

1779 1780
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
		if (neigh)
1781
			neigh_flags = neigh->flags;
1782

1783 1784 1785 1786 1787 1788 1789
		if (!(neigh_flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			rt6_remove_exception(bucket, rt6_ex);
			return;
		}
	}
W
Wei Wang 已提交
1790

1791 1792 1793
	gc_args->more++;
}

1794
void rt6_age_exceptions(struct fib6_info *rt,
1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805
			struct fib6_gc_args *gc_args,
			unsigned long now)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *tmp;
	int i;

	if (!rcu_access_pointer(rt->rt6i_exception_bucket))
		return;

1806 1807
	rcu_read_lock_bh();
	spin_lock(&rt6_exception_lock);
1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820
	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				    lockdep_is_held(&rt6_exception_lock));

	if (bucket) {
		for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
			hlist_for_each_entry_safe(rt6_ex, tmp,
						  &bucket->chain, hlist) {
				rt6_age_examine_exception(bucket, rt6_ex,
							  gc_args, now);
			}
			bucket++;
		}
	}
1821 1822
	spin_unlock(&rt6_exception_lock);
	rcu_read_unlock_bh();
1823 1824
}

1825 1826 1827
/* must be called with rcu lock held */
struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
				    int oif, struct flowi6 *fl6, int strict)
L
Linus Torvalds 已提交
1828
{
1829
	struct fib6_node *fn, *saved_fn;
1830
	struct fib6_info *f6i;
L
Linus Torvalds 已提交
1831

1832
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1833
	saved_fn = fn;
L
Linus Torvalds 已提交
1834

D
David Ahern 已提交
1835 1836 1837
	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		oif = 0;

M
Martin KaFai Lau 已提交
1838
redo_rt6_select:
1839 1840
	f6i = rt6_select(net, fn, oif, strict);
	if (f6i == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
1841 1842 1843
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto redo_rt6_select;
1844 1845 1846 1847 1848 1849
		else if (strict & RT6_LOOKUP_F_REACHABLE) {
			/* also consider unreachable route */
			strict &= ~RT6_LOOKUP_F_REACHABLE;
			fn = saved_fn;
			goto redo_rt6_select;
		}
M
Martin KaFai Lau 已提交
1850 1851
	}

1852
	trace_fib6_table_lookup(net, f6i, table, fl6);
1853

1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875
	return f6i;
}

struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
			       int oif, struct flowi6 *fl6,
			       const struct sk_buff *skb, int flags)
{
	struct fib6_info *f6i;
	struct rt6_info *rt;
	int strict = 0;

	strict |= flags & RT6_LOOKUP_F_IFACE;
	strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (net->ipv6.devconf_all->forwarding == 0)
		strict |= RT6_LOOKUP_F_REACHABLE;

	rcu_read_lock();

	f6i = fib6_table_lookup(net, table, oif, fl6, strict);
	if (f6i->fib6_nsiblings)
		f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);

1876
	if (f6i == net->ipv6.fib6_null_entry) {
D
David Ahern 已提交
1877
		rt = net->ipv6.ip6_null_entry;
1878
		rcu_read_unlock();
1879 1880
		dst_hold(&rt->dst);
		return rt;
1881 1882 1883 1884 1885
	}

	/*Search through exception table */
	rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
	if (rt) {
1886
		if (ip6_hold_safe(net, &rt, true))
1887
			dst_use_noref(&rt->dst, jiffies);
1888

1889
		rcu_read_unlock();
M
Martin KaFai Lau 已提交
1890
		return rt;
1891
	} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1892
			    !(f6i->fib6_flags & RTF_GATEWAY))) {
1893 1894 1895 1896 1897 1898 1899
		/* Create a RTF_CACHE clone which will not be
		 * owned by the fib6 tree.  It is for the special case where
		 * the daddr in the skb during the neighbor look-up is different
		 * from the fl6->daddr used to look-up route here.
		 */
		struct rt6_info *uncached_rt;

1900
		uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
M
Martin KaFai Lau 已提交
1901

1902
		rcu_read_unlock();
T
Thomas Graf 已提交
1903

1904 1905 1906 1907
		if (uncached_rt) {
			/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
			 * No need for another dst_hold()
			 */
1908
			rt6_uncached_list_add(uncached_rt);
W
Wei Wang 已提交
1909
			atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1910
		} else {
1911
			uncached_rt = net->ipv6.ip6_null_entry;
1912 1913
			dst_hold(&uncached_rt->dst);
		}
D
David Ahern 已提交
1914

1915
		return uncached_rt;
M
Martin KaFai Lau 已提交
1916 1917 1918 1919 1920
	} else {
		/* Get a percpu copy */

		struct rt6_info *pcpu_rt;

1921
		local_bh_disable();
1922
		pcpu_rt = rt6_get_pcpu_route(f6i);
M
Martin KaFai Lau 已提交
1923

1924 1925 1926
		if (!pcpu_rt)
			pcpu_rt = rt6_make_pcpu_route(net, f6i);

1927 1928
		local_bh_enable();
		rcu_read_unlock();
1929

M
Martin KaFai Lau 已提交
1930 1931
		return pcpu_rt;
	}
L
Linus Torvalds 已提交
1932
}
1933
EXPORT_SYMBOL_GPL(ip6_pol_route);
L
Linus Torvalds 已提交
1934

D
David Ahern 已提交
1935 1936 1937 1938 1939
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
1940
{
D
David Ahern 已提交
1941
	return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1942 1943
}

1944 1945
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
D
David Ahern 已提交
1946 1947 1948
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
1949 1950 1951 1952
{
	if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
		flags |= RT6_LOOKUP_F_IFACE;

D
David Ahern 已提交
1953
	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
1954
}
1955
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1956

1957
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1958 1959
				  struct flow_keys *keys,
				  struct flow_keys *flkeys)
1960 1961 1962
{
	const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
	const struct ipv6hdr *key_iph = outer_iph;
1963
	struct flow_keys *_flkeys = flkeys;
1964 1965 1966
	const struct ipv6hdr *inner_iph;
	const struct icmp6hdr *icmph;
	struct ipv6hdr _inner_iph;
1967
	struct icmp6hdr _icmph;
1968 1969 1970 1971

	if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
		goto out;

1972 1973 1974 1975 1976
	icmph = skb_header_pointer(skb, skb_transport_offset(skb),
				   sizeof(_icmph), &_icmph);
	if (!icmph)
		goto out;

1977 1978 1979 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989
	if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
	    icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
	    icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
	    icmph->icmp6_type != ICMPV6_PARAMPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       skb_transport_offset(skb) + sizeof(*icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
1990
	_flkeys = NULL;
1991
out:
1992 1993 1994 1995 1996 1997 1998 1999
	if (_flkeys) {
		keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
		keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
		keys->tags.flow_label = _flkeys->tags.flow_label;
		keys->basic.ip_proto = _flkeys->basic.ip_proto;
	} else {
		keys->addrs.v6addrs.src = key_iph->saddr;
		keys->addrs.v6addrs.dst = key_iph->daddr;
2000
		keys->tags.flow_label = ip6_flowlabel(key_iph);
2001 2002
		keys->basic.ip_proto = key_iph->nexthdr;
	}
2003 2004 2005
}

/* if skb is set it will be used and fl6 can be NULL */
2006 2007
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
2008 2009
{
	struct flow_keys hash_keys;
2010
	u32 mhash;
2011

2012
	switch (ip6_multipath_hash_policy(net)) {
2013 2014 2015 2016 2017 2018 2019 2020
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
		if (skb) {
			ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
		} else {
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
2021
			hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2022 2023 2024 2025 2026 2027 2028 2029 2030 2031 2032 2033 2034 2035 2036 2037 2038 2039 2040 2041 2042 2043 2044 2045 2046 2047 2048 2049 2050 2051 2052 2053 2054 2055
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
	case 1:
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

                        if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
			hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
			hash_keys.addrs.v6addrs.src = fl6->saddr;
			hash_keys.addrs.v6addrs.dst = fl6->daddr;
			hash_keys.ports.src = fl6->fl6_sport;
			hash_keys.ports.dst = fl6->fl6_dport;
			hash_keys.basic.ip_proto = fl6->flowi6_proto;
		}
		break;
2056
	}
2057
	mhash = flow_hash_from_keys(&hash_keys);
2058

2059
	return mhash >> 1;
2060 2061
}

T
Thomas Graf 已提交
2062 2063
void ip6_route_input(struct sk_buff *skb)
{
2064
	const struct ipv6hdr *iph = ipv6_hdr(skb);
2065
	struct net *net = dev_net(skb->dev);
2066
	int flags = RT6_LOOKUP_F_HAS_SADDR;
2067
	struct ip_tunnel_info *tun_info;
2068
	struct flowi6 fl6 = {
2069
		.flowi6_iif = skb->dev->ifindex,
2070 2071
		.daddr = iph->daddr,
		.saddr = iph->saddr,
2072
		.flowlabel = ip6_flowinfo(iph),
2073 2074
		.flowi6_mark = skb->mark,
		.flowi6_proto = iph->nexthdr,
T
Thomas Graf 已提交
2075
	};
2076
	struct flow_keys *flkeys = NULL, _flkeys;
2077

2078
	tun_info = skb_tunnel_info(skb);
2079
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2080
		fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2081 2082 2083 2084

	if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
		flkeys = &_flkeys;

2085
	if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2086
		fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2087
	skb_dst_drop(skb);
D
David Ahern 已提交
2088 2089
	skb_dst_set(skb,
		    ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
T
Thomas Graf 已提交
2090 2091
}

D
David Ahern 已提交
2092 2093 2094 2095 2096
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
L
Linus Torvalds 已提交
2097
{
D
David Ahern 已提交
2098
	return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
T
Thomas Graf 已提交
2099 2100
}

2101 2102
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
T
Thomas Graf 已提交
2103
{
2104
	bool any_src;
T
Thomas Graf 已提交
2105

2106 2107 2108 2109 2110 2111 2112
	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *dst;

		dst = l3mdev_link_scope_lookup(net, fl6);
		if (dst)
			return dst;
	}
D
David Ahern 已提交
2113

2114
	fl6->flowi6_iif = LOOPBACK_IFINDEX;
2115

2116
	any_src = ipv6_addr_any(&fl6->saddr);
2117
	if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2118
	    (fl6->flowi6_oif && any_src))
2119
		flags |= RT6_LOOKUP_F_IFACE;
T
Thomas Graf 已提交
2120

2121
	if (!any_src)
2122
		flags |= RT6_LOOKUP_F_HAS_SADDR;
2123 2124
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2125

D
David Ahern 已提交
2126
	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
L
Linus Torvalds 已提交
2127
}
2128
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
L
Linus Torvalds 已提交
2129

2130
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2131
{
2132
	struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2133
	struct net_device *loopback_dev = net->loopback_dev;
2134 2135
	struct dst_entry *new = NULL;

2136
	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2137
		       DST_OBSOLETE_DEAD, 0);
2138
	if (rt) {
2139
		rt6_info_init(rt);
W
Wei Wang 已提交
2140
		atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2141

2142
		new = &rt->dst;
2143
		new->__use = 1;
2144
		new->input = dst_discard;
E
Eric W. Biederman 已提交
2145
		new->output = dst_discard_out;
2146

2147
		dst_copy_metrics(new, &ort->dst);
2148

2149
		rt->rt6i_idev = in6_dev_get(loopback_dev);
A
Alexey Dobriyan 已提交
2150
		rt->rt6i_gateway = ort->rt6i_gateway;
2151
		rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2152 2153 2154 2155 2156 2157 2158

		memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
		memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif
	}

2159 2160
	dst_release(dst_orig);
	return new ? new : ERR_PTR(-ENOMEM);
2161 2162
}

L
Linus Torvalds 已提交
2163 2164 2165 2166
/*
 *	Destination cache support functions
 */

2167
/* Return true if @f6i is still valid for @cookie (fib node serial number
 * unchanged) and has not expired.
 */
static bool fib6_check(struct fib6_info *f6i, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
		return false;

	if (fib6_check_expired(f6i))
		return false;

	return true;
}

2180 2181 2182
/* Validate @rt against its fib entry @from and @cookie; return the dst
 * if still usable, NULL if stale or expired.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt,
				   struct fib6_info *from,
				   u32 cookie)
{
	u32 rt_cookie = 0;

	if ((from && !fib6_get_cookie_safe(from, &rt_cookie)) ||
	    rt_cookie != cookie)
		return NULL;

	if (rt6_check_expired(rt))
		return NULL;

	return &rt->dst;
}

2196 2197 2198
/* Validation for pcpu/uncached clones: valid while the clone itself has
 * not expired and its parent fib entry @from still checks out.
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
					    struct fib6_info *from,
					    u32 cookie)
{
	if (!__rt6_check_expired(rt) &&
	    rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
	    fib6_check(from, cookie))
		return &rt->dst;
	else
		return NULL;
}

L
Linus Torvalds 已提交
2208 2209
/* dst_ops->check callback: revalidate a cached dst against @cookie. */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct dst_entry *dst_ret;
	struct fib6_info *from;
	struct rt6_info *rt;

	rt = container_of(dst, struct rt6_info, dst);

	rcu_read_lock();

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	from = rcu_dereference(rt->from);

	if (from && (rt->rt6i_flags & RTF_PCPU ||
	    unlikely(!list_empty(&rt->rt6i_uncached))))
		dst_ret = rt6_dst_from_check(rt, from, cookie);
	else
		dst_ret = rt6_check(rt, from, cookie);

	rcu_read_unlock();

	return dst_ret;
}

static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (rt) {
2241
		if (rt->rt6i_flags & RTF_CACHE) {
2242
			rcu_read_lock();
2243
			if (rt6_check_expired(rt)) {
2244
				rt6_remove_exception_rt(rt);
2245 2246
				dst = NULL;
			}
2247
			rcu_read_unlock();
2248
		} else {
L
Linus Torvalds 已提交
2249
			dst_release(dst);
2250 2251
			dst = NULL;
		}
L
Linus Torvalds 已提交
2252
	}
2253
	return dst;
L
Linus Torvalds 已提交
2254 2255 2256 2257 2258 2259
}

static void ip6_link_failure(struct sk_buff *skb)
{
	struct rt6_info *rt;

2260
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
L
Linus Torvalds 已提交
2261

E
Eric Dumazet 已提交
2262
	rt = (struct rt6_info *) skb_dst(skb);
L
Linus Torvalds 已提交
2263
	if (rt) {
2264
		rcu_read_lock();
2265
		if (rt->rt6i_flags & RTF_CACHE) {
W
Wei Wang 已提交
2266
			if (dst_hold_safe(&rt->dst))
2267
				rt6_remove_exception_rt(rt);
2268
		} else {
2269
			struct fib6_info *from;
2270 2271
			struct fib6_node *fn;

2272 2273 2274 2275 2276 2277
			from = rcu_dereference(rt->from);
			if (from) {
				fn = rcu_dereference(from->fib6_node);
				if (fn && (rt->rt6i_flags & RTF_DEFAULT))
					fn->fn_sernum = -1;
			}
2278
		}
2279
		rcu_read_unlock();
L
Linus Torvalds 已提交
2280 2281 2282
	}
}

2283 2284
static void rt6_update_expires(struct rt6_info *rt0, int timeout)
{
2285 2286 2287 2288 2289 2290 2291 2292 2293
	if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
		struct fib6_info *from;

		rcu_read_lock();
		from = rcu_dereference(rt0->from);
		if (from)
			rt0->dst.expires = from->expires;
		rcu_read_unlock();
	}
2294 2295 2296 2297 2298

	dst_set_expires(&rt0->dst, timeout);
	rt0->rt6i_flags |= RTF_EXPIRES;
}

2299 2300 2301 2302
/* Record a learned path MTU on @rt and arm its PMTU expiry timer. */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);

	dst_metric_set(&rt->dst, RTAX_MTU, mtu);
	rt->rt6i_flags |= RTF_MODIFIED;
	rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}

2308 2309
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
2310 2311 2312 2313 2314 2315
	bool from_set;

	rcu_read_lock();
	from_set = !!rcu_dereference(rt->from);
	rcu_read_unlock();

2316
	return !(rt->rt6i_flags & RTF_CACHE) &&
2317
		(rt->rt6i_flags & RTF_PCPU || from_set);
2318 2319
}

2320 2321
/* Apply a learned path MTU to @dst.  Addresses are taken from @iph when
 * present, else from @sk; either may be NULL.  Updates are ignored if the
 * MTU metric is locked or the new value is not smaller than the current one.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr, *saddr;
	struct rt6_info *rt6 = (struct rt6_info *)dst;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	} else {
		daddr = NULL;
		saddr = NULL;
	}
	dst_confirm_neigh(dst, daddr);
	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
	} else if (daddr) {
		/* record the PMTU in a per-destination exception clone */
		struct fib6_info *from;
		struct rt6_info *nrt6;

		rcu_read_lock();
		from = rcu_dereference(rt6->from);
		nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
		if (nrt6) {
			rt6_do_update_pmtu(nrt6, mtu);
			if (rt6_insert_exception(nrt6, from))
				dst_release_immediate(&nrt6->dst);
		}
		rcu_read_unlock();
	}
}

2365 2366 2367 2368 2369 2370
/* dst_ops->update_pmtu callback: thin wrapper around __ip6_rt_update_pmtu. */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	__ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
}

2371
/* Update the path MTU for the flow described by the IPv6 header at the
 * start of @skb's data (e.g. from an ICMPv6 Packet Too Big payload).
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);

/* Socket-context PMTU update: apply the new MTU for @sk's flow, then
 * refresh the socket's cached dst if it no longer validates.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst || !dst->obsolete ||
	    dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);

2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428
/* Cache @dst on @sk via ip6_dst_store().  The destination (and, with
 * CONFIG_IPV6_SUBTREES, the source) address is pinned only when it
 * matches the flow, so the cached route is reused for that flow alone.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	ip6_dst_store(sk, dst,
		      ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
		      &sk->sk_v6_daddr : NULL,
#ifdef CONFIG_IPV6_SUBTREES
		      ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
		      &np->saddr :
#endif
		      NULL);
}

2429 2430 2431 2432 2433 2434 2435 2436 2437
/* Handle redirects */
struct ip6rd_flowi {
	struct flowi6 fl6;
	struct in6_addr gateway;
};

static struct rt6_info *__ip6_route_redirect(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
D
David Ahern 已提交
2438
					     const struct sk_buff *skb,
2439 2440 2441
					     int flags)
{
	struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2442
	struct rt6_info *ret = NULL, *rt_cache;
2443
	struct fib6_info *rt;
2444 2445 2446
	struct fib6_node *fn;

	/* Get the "current" route for this destination and
A
Alexander Alemayhu 已提交
2447
	 * check if the redirect has come from appropriate router.
2448 2449 2450 2451 2452 2453 2454 2455
	 *
	 * RFC 4861 specifies that redirects should only be
	 * accepted if they come from the nexthop to the target.
	 * Due to the way the routes are chosen, this notion
	 * is a bit fuzzy and one might need to check all possible
	 * routes.
	 */

2456
	rcu_read_lock();
2457
	fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2458
restart:
2459
	for_each_fib6_node_rt_rcu(fn) {
2460
		if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2461
			continue;
2462
		if (fib6_check_expired(rt))
2463
			continue;
2464
		if (rt->fib6_flags & RTF_REJECT)
2465
			break;
2466
		if (!(rt->fib6_flags & RTF_GATEWAY))
2467
			continue;
2468
		if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2469
			continue;
2470 2471 2472 2473 2474
		/* rt_cache's gateway might be different from its 'parent'
		 * in the case of an ip redirect.
		 * So we keep searching in the exception table if the gateway
		 * is different.
		 */
2475
		if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2476 2477 2478 2479 2480 2481
			rt_cache = rt6_find_cached_rt(rt,
						      &fl6->daddr,
						      &fl6->saddr);
			if (rt_cache &&
			    ipv6_addr_equal(&rdfl->gateway,
					    &rt_cache->rt6i_gateway)) {
2482
				ret = rt_cache;
2483 2484
				break;
			}
2485
			continue;
2486
		}
2487 2488 2489 2490
		break;
	}

	if (!rt)
D
David Ahern 已提交
2491
		rt = net->ipv6.fib6_null_entry;
2492
	else if (rt->fib6_flags & RTF_REJECT) {
2493
		ret = net->ipv6.ip6_null_entry;
2494 2495 2496
		goto out;
	}

D
David Ahern 已提交
2497
	if (rt == net->ipv6.fib6_null_entry) {
M
Martin KaFai Lau 已提交
2498 2499 2500
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (fn)
			goto restart;
2501
	}
M
Martin KaFai Lau 已提交
2502

2503
out:
2504
	if (ret)
2505
		ip6_hold_safe(net, &ret, true);
2506 2507
	else
		ret = ip6_create_rt_rcu(rt);
2508

2509
	rcu_read_unlock();
2510

2511
	trace_fib6_table_lookup(net, rt, table, fl6);
2512
	return ret;
2513 2514 2515
};

static struct dst_entry *ip6_route_redirect(struct net *net,
D
David Ahern 已提交
2516 2517 2518
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
2519 2520 2521 2522 2523 2524 2525
{
	int flags = RT6_LOOKUP_F_HAS_SADDR;
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

D
David Ahern 已提交
2526
	return fib6_rule_lookup(net, &rdfl.fl6, skb,
2527 2528 2529
				flags, __ip6_route_redirect);
}

2530 2531
/* Process an ICMPv6 redirect for the packet embedded at @skb->data. */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);

2552 2553 2554 2555 2556 2557 2558 2559 2560
/* Process a redirect whose ICMPv6 payload carries no packet header;
 * the target is taken from the redirect message itself.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct ipv6hdr *iph = ipv6_hdr(skb);
	const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.daddr = msg->dest;
	fl6.saddr = iph->daddr;
	fl6.flowi6_uid = sock_net_uid(net, NULL);

	dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}

2573 2574
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
2575 2576
	ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
		     sk->sk_uid);
2577 2578 2579
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);

2580
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
L
Linus Torvalds 已提交
2581
{
2582 2583 2584 2585
	struct net_device *dev = dst->dev;
	unsigned int mtu = dst_mtu(dst);
	struct net *net = dev_net(dev);

L
Linus Torvalds 已提交
2586 2587
	mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

2588 2589
	if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
		mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
L
Linus Torvalds 已提交
2590 2591

	/*
2592 2593 2594
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
L
Linus Torvalds 已提交
2595 2596 2597 2598 2599 2600 2601
	 * rely only on pmtu discovery"
	 */
	if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
		mtu = IPV6_MAXPLEN;
	return mtu;
}

2602
static unsigned int ip6_mtu(const struct dst_entry *dst)
2603 2604
{
	struct inet6_dev *idev;
2605
	unsigned int mtu;
2606 2607

	mtu = dst_metric_raw(dst, RTAX_MTU);
2608
	if (mtu)
E
Eric Dumazet 已提交
2609
		goto out;
2610 2611

	mtu = IPV6_MIN_MTU;
2612 2613 2614 2615 2616 2617 2618

	rcu_read_lock();
	idev = __in6_dev_get(dst->dev);
	if (idev)
		mtu = idev->cnf.mtu6;
	rcu_read_unlock();

E
Eric Dumazet 已提交
2619
out:
2620 2621 2622
	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
2623 2624
}

2625 2626 2627 2628 2629 2630 2631 2632 2633 2634 2635 2636 2637 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647 2648 2649 2650 2651 2652 2653 2654 2655 2656 2657 2658 2659 2660 2661 2662 2663 2664 2665 2666 2667 2668 2669 2670 2671 2672
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 *
 * based on ip6_dst_mtu_forward and exception logic of
 * rt6_find_cached_rt; called with rcu_read_lock
 */
u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
		      struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct in6_addr *src_key;
	struct inet6_dev *idev;
	u32 mtu = 0;

	/* case 1: locked MTU metric wins outright */
	if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
		mtu = f6i->fib6_pmtu;
		if (mtu)
			goto out;
	}

	src_key = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	/* exceptions of subtree routes are additionally keyed on source */
	if (f6i->fib6_src.plen)
		src_key = saddr;
#endif

	/* case 2: a non-expired exception entry may carry a learned PMTU */
	bucket = rcu_dereference(f6i->rt6i_exception_bucket);
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
		mtu = dst_metric_raw(&rt6_ex->rt6i->dst, RTAX_MTU);

	/* case 3: fall back to the nexthop device's IPv6 MTU */
	if (likely(!mtu)) {
		struct net_device *dev = fib6_info_nh_dev(f6i);

		mtu = IPV6_MIN_MTU;
		idev = __in6_dev_get(dev);
		if (idev && idev->cnf.mtu6 > mtu)
			mtu = idev->cnf.mtu6;
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
out:
	/* leave room for lwtunnel encapsulation headers, if any */
	return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
}

2673
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2674
				  struct flowi6 *fl6)
L
Linus Torvalds 已提交
2675
{
2676
	struct dst_entry *dst;
L
Linus Torvalds 已提交
2677 2678
	struct rt6_info *rt;
	struct inet6_dev *idev = in6_dev_get(dev);
2679
	struct net *net = dev_net(dev);
L
Linus Torvalds 已提交
2680

2681
	if (unlikely(!idev))
E
Eric Dumazet 已提交
2682
		return ERR_PTR(-ENODEV);
L
Linus Torvalds 已提交
2683

2684
	rt = ip6_dst_alloc(net, dev, 0);
2685
	if (unlikely(!rt)) {
L
Linus Torvalds 已提交
2686
		in6_dev_put(idev);
2687
		dst = ERR_PTR(-ENOMEM);
L
Linus Torvalds 已提交
2688 2689 2690
		goto out;
	}

2691
	rt->dst.flags |= DST_HOST;
2692
	rt->dst.input = ip6_input;
2693
	rt->dst.output  = ip6_output;
2694
	rt->rt6i_gateway  = fl6->daddr;
2695
	rt->rt6i_dst.addr = fl6->daddr;
2696 2697
	rt->rt6i_dst.plen = 128;
	rt->rt6i_idev     = idev;
L
Li RongQing 已提交
2698
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
L
Linus Torvalds 已提交
2699

2700
	/* Add this dst into uncached_list so that rt6_disable_ip() can
2701 2702 2703
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
W
Wei Wang 已提交
2704
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
L
Linus Torvalds 已提交
2705

2706 2707
	dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);

L
Linus Torvalds 已提交
2708
out:
2709
	return dst;
L
Linus Torvalds 已提交
2710 2711
}

2712
static int ip6_dst_gc(struct dst_ops *ops)
L
Linus Torvalds 已提交
2713
{
2714
	struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2715 2716 2717 2718 2719
	int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
	int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
	int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
	int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
	unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2720
	int entries;
2721

2722
	entries = dst_entries_get_fast(ops);
2723
	if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2724
	    entries <= rt_max_size)
L
Linus Torvalds 已提交
2725 2726
		goto out;

2727
	net->ipv6.ip6_rt_gc_expire++;
2728
	fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2729 2730
	entries = dst_entries_get_slow(ops);
	if (entries < ops->gc_thresh)
2731
		net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
L
Linus Torvalds 已提交
2732
out:
2733
	net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2734
	return entries > rt_max_size;
L
Linus Torvalds 已提交
2735 2736
}

2737
static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2738
			       struct fib6_config *cfg)
2739
{
2740
	struct dst_metrics *p;
2741

2742
	if (!cfg->fc_mx)
2743 2744
		return 0;

2745 2746
	p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
	if (unlikely(!p))
2747 2748
		return -ENOMEM;

2749 2750
	refcount_set(&p->refcnt, 1);
	rt->fib6_metrics = p;
2751

2752
	return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2753
}
L
Linus Torvalds 已提交
2754

2755 2756
/* Look up nexthop @gw_addr in a specific fib table @tbid.  Returns the
 * route with a reference held, or NULL if the table is missing or the
 * lookup only produced the null entry.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table;
	struct rt6_info *rt;

	table = fib6_get_table(net, tbid);
	if (!table)
		return NULL;

	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);

	/* if table lookup failed, fall back to full lookup */
	if (rt == net->ipv6.ip6_null_entry) {
		ip6_rt_put(rt);
		rt = NULL;
	}

	return rt;
}

2787 2788
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
2789
				     const struct net_device *dev,
2790 2791
				     struct netlink_ext_ack *extack)
{
2792
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2793 2794 2795 2796 2797 2798 2799 2800
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err;

	err = 0;
	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (grt) {
2801 2802
		if (!grt->dst.error &&
		    (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2803 2804
			NL_SET_ERR_MSG(extack,
				       "Nexthop has invalid gateway or device mismatch");
2805 2806 2807 2808 2809 2810 2811 2812 2813
			err = -EINVAL;
		}

		ip6_rt_put(grt);
	}

	return err;
}

2814 2815 2816 2817 2818 2819 2820 2821 2822 2823 2824
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err = -EHOSTUNREACH;

	if (cfg->fc_table) {
2825 2826 2827 2828
		int flags = RT6_LOOKUP_F_IFACE;

		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, flags);
2829 2830 2831 2832 2833 2834 2835 2836 2837 2838
		if (grt) {
			if (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev)) {
				ip6_rt_put(grt);
				grt = NULL;
			}
		}
	}

	if (!grt)
D
David Ahern 已提交
2839
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2840 2841 2842 2843 2844 2845 2846 2847 2848 2849 2850 2851 2852 2853 2854 2855 2856 2857 2858 2859 2860 2861 2862 2863 2864

	if (!grt)
		goto out;

	if (dev) {
		if (dev != grt->dst.dev) {
			ip6_rt_put(grt);
			goto out;
		}
	} else {
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	}

	if (!(grt->rt6i_flags & RTF_GATEWAY))
		err = 0;

	ip6_rt_put(grt);

out:
	return err;
}

2865 2866 2867 2868 2869 2870
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
			   struct net_device **_dev, struct inet6_dev **idev,
			   struct netlink_ext_ack *extack)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	int gwa_type = ipv6_addr_type(gw_addr);
2871
	bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2872
	const struct net_device *dev = *_dev;
2873
	bool need_addr_check = !dev;
2874 2875 2876 2877 2878 2879 2880
	int err = -EINVAL;

	/* if gw_addr is local we will fail to detect this in case
	 * address is still TENTATIVE (DAD in progress). rt6_lookup()
	 * will return already-added prefix route via interface that
	 * prefix route was assigned to, which might be non-loopback.
	 */
2881 2882 2883
	if (dev &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2884 2885 2886 2887 2888 2889 2890 2891 2892 2893 2894 2895 2896 2897 2898 2899 2900 2901 2902 2903 2904 2905 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922
		goto out;
	}

	if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
		/* IPv6 strictly inhibits using not link-local
		 * addresses as nexthop address.
		 * Otherwise, router will not able to send redirects.
		 * It is very good, but in some (rare!) circumstances
		 * (SIT, PtP, NBMA NOARP links) it is handy to allow
		 * some exceptions. --ANK
		 * We allow IPv4-mapped nexthops to support RFC4798-type
		 * addressing
		 */
		if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
			NL_SET_ERR_MSG(extack, "Invalid gateway address");
			goto out;
		}

		if (cfg->fc_flags & RTNH_F_ONLINK)
			err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
		else
			err = ip6_route_check_nh(net, cfg, _dev, idev);

		if (err)
			goto out;
	}

	/* reload in case device was changed */
	dev = *_dev;

	err = -EINVAL;
	if (!dev) {
		NL_SET_ERR_MSG(extack, "Egress device not specified");
		goto out;
	} else if (dev->flags & IFF_LOOPBACK) {
		NL_SET_ERR_MSG(extack,
			       "Egress device can not be loopback device for this route");
		goto out;
	}
2923 2924 2925 2926 2927 2928 2929 2930 2931 2932

	/* if we did not check gw_addr above, do so now that the
	 * egress device has been resolved.
	 */
	if (need_addr_check &&
	    ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
		NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
		goto out;
	}

2933 2934 2935 2936 2937
	err = 0;
out:
	return err;
}

2938
static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
2939
					      gfp_t gfp_flags,
2940
					      struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
2941
{
2942
	struct net *net = cfg->fc_nlinfo.nl_net;
2943
	struct fib6_info *rt = NULL;
L
Linus Torvalds 已提交
2944 2945
	struct net_device *dev = NULL;
	struct inet6_dev *idev = NULL;
T
Thomas Graf 已提交
2946
	struct fib6_table *table;
L
Linus Torvalds 已提交
2947
	int addr_type;
2948
	int err = -EINVAL;
L
Linus Torvalds 已提交
2949

2950
	/* RTF_PCPU is an internal flag; can not be set by userspace */
2951 2952
	if (cfg->fc_flags & RTF_PCPU) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
2953
		goto out;
2954
	}
2955

2956 2957 2958 2959 2960 2961
	/* RTF_CACHE is an internal flag; can not be set by userspace */
	if (cfg->fc_flags & RTF_CACHE) {
		NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
		goto out;
	}

2962 2963 2964 2965 2966
	if (cfg->fc_type > RTN_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid route type");
		goto out;
	}

2967 2968 2969 2970 2971 2972
	if (cfg->fc_dst_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid prefix length");
		goto out;
	}
	if (cfg->fc_src_len > 128) {
		NL_SET_ERR_MSG(extack, "Invalid source address length");
2973
		goto out;
2974
	}
L
Linus Torvalds 已提交
2975
#ifndef CONFIG_IPV6_SUBTREES
2976 2977 2978
	if (cfg->fc_src_len) {
		NL_SET_ERR_MSG(extack,
			       "Specifying source address requires IPV6_SUBTREES to be enabled");
2979
		goto out;
2980
	}
L
Linus Torvalds 已提交
2981
#endif
2982
	if (cfg->fc_ifindex) {
L
Linus Torvalds 已提交
2983
		err = -ENODEV;
2984
		dev = dev_get_by_index(net, cfg->fc_ifindex);
L
Linus Torvalds 已提交
2985 2986 2987 2988 2989 2990 2991
		if (!dev)
			goto out;
		idev = in6_dev_get(dev);
		if (!idev)
			goto out;
	}

2992 2993
	if (cfg->fc_metric == 0)
		cfg->fc_metric = IP6_RT_PRIO_USER;
L
Linus Torvalds 已提交
2994

2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009
	if (cfg->fc_flags & RTNH_F_ONLINK) {
		if (!dev) {
			NL_SET_ERR_MSG(extack,
				       "Nexthop device required for onlink");
			err = -ENODEV;
			goto out;
		}

		if (!(dev->flags & IFF_UP)) {
			NL_SET_ERR_MSG(extack, "Nexthop device is not up");
			err = -ENETDOWN;
			goto out;
		}
	}

3010
	err = -ENOBUFS;
3011 3012
	if (cfg->fc_nlinfo.nlh &&
	    !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3013
		table = fib6_get_table(net, cfg->fc_table);
3014
		if (!table) {
3015
			pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3016 3017 3018 3019 3020
			table = fib6_new_table(net, cfg->fc_table);
		}
	} else {
		table = fib6_new_table(net, cfg->fc_table);
	}
3021 3022

	if (!table)
T
Thomas Graf 已提交
3023 3024
		goto out;

3025 3026 3027
	err = -ENOMEM;
	rt = fib6_info_alloc(gfp_flags);
	if (!rt)
L
Linus Torvalds 已提交
3028
		goto out;
3029 3030 3031

	if (cfg->fc_flags & RTF_ADDRCONF)
		rt->dst_nocount = true;
L
Linus Torvalds 已提交
3032

3033 3034
	err = ip6_convert_metrics(net, rt, cfg);
	if (err < 0)
L
Linus Torvalds 已提交
3035 3036
		goto out;

3037
	if (cfg->fc_flags & RTF_EXPIRES)
3038
		fib6_set_expires(rt, jiffies +
3039 3040
				clock_t_to_jiffies(cfg->fc_expires));
	else
3041
		fib6_clean_expires(rt);
L
Linus Torvalds 已提交
3042

3043 3044
	if (cfg->fc_protocol == RTPROT_UNSPEC)
		cfg->fc_protocol = RTPROT_BOOT;
3045
	rt->fib6_protocol = cfg->fc_protocol;
3046 3047

	addr_type = ipv6_addr_type(&cfg->fc_dst);
L
Linus Torvalds 已提交
3048

3049 3050 3051
	if (cfg->fc_encap) {
		struct lwtunnel_state *lwtstate;

3052
		err = lwtunnel_build_state(cfg->fc_encap_type,
3053
					   cfg->fc_encap, AF_INET6, cfg,
3054
					   &lwtstate, extack);
3055 3056
		if (err)
			goto out;
3057
		rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3058 3059
	}

3060 3061 3062
	ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
	rt->fib6_dst.plen = cfg->fc_dst_len;
	if (rt->fib6_dst.plen == 128)
3063
		rt->dst_host = true;
3064

L
Linus Torvalds 已提交
3065
#ifdef CONFIG_IPV6_SUBTREES
3066 3067
	ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
	rt->fib6_src.plen = cfg->fc_src_len;
L
Linus Torvalds 已提交
3068 3069
#endif

3070
	rt->fib6_metric = cfg->fc_metric;
3071
	rt->fib6_nh.nh_weight = 1;
L
Linus Torvalds 已提交
3072

3073
	rt->fib6_type = cfg->fc_type;
L
Linus Torvalds 已提交
3074 3075 3076 3077

	/* We cannot add true routes via loopback here,
	   they would result in kernel looping; promote them to reject routes
	 */
3078
	if ((cfg->fc_flags & RTF_REJECT) ||
3079 3080 3081
	    (dev && (dev->flags & IFF_LOOPBACK) &&
	     !(addr_type & IPV6_ADDR_LOOPBACK) &&
	     !(cfg->fc_flags & RTF_LOCAL))) {
L
Linus Torvalds 已提交
3082
		/* hold loopback dev/idev if we haven't done so. */
3083
		if (dev != net->loopback_dev) {
L
Linus Torvalds 已提交
3084 3085 3086 3087
			if (dev) {
				dev_put(dev);
				in6_dev_put(idev);
			}
3088
			dev = net->loopback_dev;
L
Linus Torvalds 已提交
3089 3090 3091 3092 3093 3094 3095
			dev_hold(dev);
			idev = in6_dev_get(dev);
			if (!idev) {
				err = -ENODEV;
				goto out;
			}
		}
3096
		rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
L
Linus Torvalds 已提交
3097 3098 3099
		goto install_route;
	}

3100
	if (cfg->fc_flags & RTF_GATEWAY) {
3101 3102
		err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
		if (err)
3103
			goto out;
L
Linus Torvalds 已提交
3104

3105
		rt->fib6_nh.nh_gw = cfg->fc_gateway;
L
Linus Torvalds 已提交
3106 3107 3108
	}

	err = -ENODEV;
3109
	if (!dev)
L
Linus Torvalds 已提交
3110 3111
		goto out;

3112 3113 3114 3115 3116 3117
	if (idev->cnf.disable_ipv6) {
		NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
		err = -EACCES;
		goto out;
	}

3118 3119 3120 3121 3122 3123
	if (!(dev->flags & IFF_UP)) {
		NL_SET_ERR_MSG(extack, "Nexthop device is not up");
		err = -ENETDOWN;
		goto out;
	}

3124 3125
	if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
		if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3126
			NL_SET_ERR_MSG(extack, "Invalid source address");
3127 3128 3129
			err = -EINVAL;
			goto out;
		}
3130 3131
		rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
		rt->fib6_prefsrc.plen = 128;
3132
	} else
3133
		rt->fib6_prefsrc.plen = 0;
3134

3135
	rt->fib6_flags = cfg->fc_flags;
L
Linus Torvalds 已提交
3136 3137

install_route:
3138
	if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3139
	    !netif_carrier_ok(dev))
3140 3141
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
	rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3142
	rt->fib6_nh.nh_dev = dev;
3143
	rt->fib6_table = table;
3144

3145
	cfg->fc_nlinfo.nl_net = dev_net(dev);
3146

D
David Ahern 已提交
3147 3148 3149
	if (idev)
		in6_dev_put(idev);

3150
	return rt;
3151 3152 3153 3154 3155 3156
out:
	if (dev)
		dev_put(dev);
	if (idev)
		in6_dev_put(idev);

3157
	fib6_info_release(rt);
3158
	return ERR_PTR(err);
3159 3160
}

3161
/* Build a fib6_info from @cfg and insert it into the FIB.
 * The insertion path takes its own reference on success, so the
 * local reference from ip6_route_info_create() is always dropped.
 */
int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
		  struct netlink_ext_ack *extack)
{
	struct fib6_info *rt;
	int err;

	rt = ip6_route_info_create(cfg, gfp_flags, extack);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
	fib6_info_release(rt);

	return err;
}

3177
static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
L
Linus Torvalds 已提交
3178
{
3179
	struct net *net = info->nl_net;
T
Thomas Graf 已提交
3180
	struct fib6_table *table;
3181
	int err;
L
Linus Torvalds 已提交
3182

D
David Ahern 已提交
3183
	if (rt == net->ipv6.fib6_null_entry) {
3184 3185 3186
		err = -ENOENT;
		goto out;
	}
3187

3188
	table = rt->fib6_table;
3189
	spin_lock_bh(&table->tb6_lock);
3190
	err = fib6_del(rt, info);
3191
	spin_unlock_bh(&table->tb6_lock);
L
Linus Torvalds 已提交
3192

3193
out:
3194
	fib6_info_release(rt);
L
Linus Torvalds 已提交
3195 3196 3197
	return err;
}

3198
int ip6_del_rt(struct net *net, struct fib6_info *rt)
3199
{
3200 3201
	struct nl_info info = { .nl_net = net };

3202
	return __ip6_del_rt(rt, &info);
3203 3204
}

3205
static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3206 3207
{
	struct nl_info *info = &cfg->fc_nlinfo;
3208
	struct net *net = info->nl_net;
3209
	struct sk_buff *skb = NULL;
3210
	struct fib6_table *table;
3211
	int err = -ENOENT;
3212

D
David Ahern 已提交
3213
	if (rt == net->ipv6.fib6_null_entry)
3214
		goto out_put;
3215
	table = rt->fib6_table;
3216
	spin_lock_bh(&table->tb6_lock);
3217

3218
	if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3219
		struct fib6_info *sibling, *next_sibling;
3220

3221 3222 3223 3224 3225
		/* prefer to send a single notification with all hops */
		skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
		if (skb) {
			u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;

3226
			if (rt6_fill_node(net, skb, rt, NULL,
3227 3228 3229 3230 3231 3232 3233 3234
					  NULL, NULL, 0, RTM_DELROUTE,
					  info->portid, seq, 0) < 0) {
				kfree_skb(skb);
				skb = NULL;
			} else
				info->skip_notify = 1;
		}

3235
		list_for_each_entry_safe(sibling, next_sibling,
3236 3237
					 &rt->fib6_siblings,
					 fib6_siblings) {
3238 3239
			err = fib6_del(sibling, info);
			if (err)
3240
				goto out_unlock;
3241 3242 3243 3244
		}
	}

	err = fib6_del(rt, info);
3245
out_unlock:
3246
	spin_unlock_bh(&table->tb6_lock);
3247
out_put:
3248
	fib6_info_release(rt);
3249 3250

	if (skb) {
3251
		rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3252 3253
			    info->nlh, gfp_any());
	}
3254 3255 3256
	return err;
}

3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272
/* Delete a cached (exception-table) route if it matches the selectors in
 * @cfg.  Returns 0 on removal, -ESRCH when the entry does not match or has
 * already gone away.
 */
static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
{
	int rc = -ESRCH;

	/* device filter: only delete if it lives on the requested ifindex */
	if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
		goto out;

	/* gateway filter: only delete the hop with the requested gateway */
	if (cfg->fc_flags & RTF_GATEWAY &&
	    !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
		goto out;
	/* dst_hold_safe() guards against racing with a concurrent release */
	if (dst_hold_safe(&rt->dst))
		rc = rt6_remove_exception_rt(rt);
out:
	return rc;
}

3273 3274
static int ip6_route_del(struct fib6_config *cfg,
			 struct netlink_ext_ack *extack)
L
Linus Torvalds 已提交
3275
{
3276
	struct rt6_info *rt_cache;
T
Thomas Graf 已提交
3277
	struct fib6_table *table;
3278
	struct fib6_info *rt;
L
Linus Torvalds 已提交
3279 3280 3281
	struct fib6_node *fn;
	int err = -ESRCH;

3282
	table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3283 3284
	if (!table) {
		NL_SET_ERR_MSG(extack, "FIB table does not exist");
T
Thomas Graf 已提交
3285
		return err;
3286
	}
T
Thomas Graf 已提交
3287

3288
	rcu_read_lock();
L
Linus Torvalds 已提交
3289

T
Thomas Graf 已提交
3290
	fn = fib6_locate(&table->tb6_root,
3291
			 &cfg->fc_dst, cfg->fc_dst_len,
3292
			 &cfg->fc_src, cfg->fc_src_len,
3293
			 !(cfg->fc_flags & RTF_CACHE));
3294

L
Linus Torvalds 已提交
3295
	if (fn) {
3296
		for_each_fib6_node_rt_rcu(fn) {
3297
			if (cfg->fc_flags & RTF_CACHE) {
3298 3299
				int rc;

3300 3301
				rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
							      &cfg->fc_src);
3302 3303
				if (rt_cache) {
					rc = ip6_del_cached_rt(rt_cache, cfg);
3304 3305
					if (rc != -ESRCH) {
						rcu_read_unlock();
3306
						return rc;
3307
					}
3308 3309
				}
				continue;
3310
			}
3311
			if (cfg->fc_ifindex &&
3312 3313
			    (!rt->fib6_nh.nh_dev ||
			     rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
L
Linus Torvalds 已提交
3314
				continue;
3315
			if (cfg->fc_flags & RTF_GATEWAY &&
3316
			    !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
L
Linus Torvalds 已提交
3317
				continue;
3318
			if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
L
Linus Torvalds 已提交
3319
				continue;
3320
			if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3321
				continue;
3322 3323
			if (!fib6_info_hold_safe(rt))
				continue;
3324
			rcu_read_unlock();
L
Linus Torvalds 已提交
3325

3326 3327 3328 3329 3330
			/* if gateway was specified only delete the one hop */
			if (cfg->fc_flags & RTF_GATEWAY)
				return __ip6_del_rt(rt, &cfg->fc_nlinfo);

			return __ip6_del_rt_siblings(rt, cfg);
L
Linus Torvalds 已提交
3331 3332
		}
	}
3333
	rcu_read_unlock();
L
Linus Torvalds 已提交
3334 3335 3336 3337

	return err;
}

3338
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3339 3340
{
	struct netevent_redirect netevent;
3341 3342 3343 3344
	struct rt6_info *rt, *nrt = NULL;
	struct ndisc_options ndopts;
	struct inet6_dev *in6_dev;
	struct neighbour *neigh;
3345
	struct fib6_info *from;
3346
	struct rd_msg *msg;
3347 3348
	int optlen, on_link;
	u8 *lladdr;
3349

3350
	optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3351
	optlen -= sizeof(*msg);
3352 3353

	if (optlen < 0) {
3354
		net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3355 3356 3357
		return;
	}

3358
	msg = (struct rd_msg *)icmp6_hdr(skb);
3359

3360
	if (ipv6_addr_is_multicast(&msg->dest)) {
3361
		net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3362 3363 3364
		return;
	}

3365
	on_link = 0;
3366
	if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3367
		on_link = 1;
3368
	} else if (ipv6_addr_type(&msg->target) !=
3369
		   (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3370
		net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3371 3372 3373 3374 3375 3376 3377 3378 3379 3380 3381 3382 3383 3384
		return;
	}

	in6_dev = __in6_dev_get(skb->dev);
	if (!in6_dev)
		return;
	if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
		return;

	/* RFC2461 8.1:
	 *	The IP source address of the Redirect MUST be the same as the current
	 *	first-hop router for the specified ICMP Destination Address.
	 */

3385
	if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3386 3387 3388
		net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
		return;
	}
3389 3390

	lladdr = NULL;
3391 3392 3393 3394 3395 3396 3397 3398 3399
	if (ndopts.nd_opts_tgt_lladdr) {
		lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
					     skb->dev);
		if (!lladdr) {
			net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
			return;
		}
	}

3400
	rt = (struct rt6_info *) dst;
3401
	if (rt->rt6i_flags & RTF_REJECT) {
3402
		net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3403
		return;
3404
	}
3405

3406 3407 3408 3409
	/* Redirect received -> path was valid.
	 * Look, redirects are sent only in response to data packets,
	 * so that this nexthop apparently is reachable. --ANK
	 */
3410
	dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3411

3412
	neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3413 3414
	if (!neigh)
		return;
3415

L
Linus Torvalds 已提交
3416 3417 3418 3419
	/*
	 *	We have finally decided to accept it.
	 */

3420
	ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
L
Linus Torvalds 已提交
3421 3422 3423
		     NEIGH_UPDATE_F_WEAK_OVERRIDE|
		     NEIGH_UPDATE_F_OVERRIDE|
		     (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3424 3425
				     NEIGH_UPDATE_F_ISROUTER)),
		     NDISC_REDIRECT, &ndopts);
L
Linus Torvalds 已提交
3426

3427
	rcu_read_lock();
3428
	from = rcu_dereference(rt->from);
3429 3430 3431
	/* This fib6_info_hold() is safe here because we hold reference to rt
	 * and rt already holds reference to fib6_info.
	 */
3432
	fib6_info_hold(from);
3433
	rcu_read_unlock();
3434 3435

	nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3436
	if (!nrt)
L
Linus Torvalds 已提交
3437 3438 3439 3440 3441 3442
		goto out;

	nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
	if (on_link)
		nrt->rt6i_flags &= ~RTF_GATEWAY;

A
Alexey Dobriyan 已提交
3443
	nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
L
Linus Torvalds 已提交
3444

3445 3446 3447 3448
	/* No need to remove rt from the exception table if rt is
	 * a cached route because rt6_insert_exception() will
	 * takes care of it
	 */
3449
	if (rt6_insert_exception(nrt, from)) {
3450 3451 3452
		dst_release_immediate(&nrt->dst);
		goto out;
	}
L
Linus Torvalds 已提交
3453

3454 3455
	netevent.old = &rt->dst;
	netevent.new = &nrt->dst;
3456
	netevent.daddr = &msg->dest;
3457
	netevent.neigh = neigh;
3458 3459
	call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);

L
Linus Torvalds 已提交
3460
out:
3461
	fib6_info_release(from);
3462
	neigh_release(neigh);
3463 3464
}

3465
#ifdef CONFIG_IPV6_ROUTE_INFO
3466
static struct fib6_info *rt6_get_route_info(struct net *net,
3467
					   const struct in6_addr *prefix, int prefixlen,
3468 3469
					   const struct in6_addr *gwaddr,
					   struct net_device *dev)
3470
{
3471 3472
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
	int ifindex = dev->ifindex;
3473
	struct fib6_node *fn;
3474
	struct fib6_info *rt = NULL;
T
Thomas Graf 已提交
3475 3476
	struct fib6_table *table;

3477
	table = fib6_get_table(net, tb_id);
3478
	if (!table)
T
Thomas Graf 已提交
3479
		return NULL;
3480

3481
	rcu_read_lock();
3482
	fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3483 3484 3485
	if (!fn)
		goto out;

3486
	for_each_fib6_node_rt_rcu(fn) {
3487
		if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3488
			continue;
3489
		if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3490
			continue;
3491
		if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3492
			continue;
3493 3494
		if (!fib6_info_hold_safe(rt))
			continue;
3495 3496 3497
		break;
	}
out:
3498
	rcu_read_unlock();
3499 3500 3501
	return rt;
}

3502
static struct fib6_info *rt6_add_route_info(struct net *net,
3503
					   const struct in6_addr *prefix, int prefixlen,
3504 3505
					   const struct in6_addr *gwaddr,
					   struct net_device *dev,
3506
					   unsigned int pref)
3507
{
3508
	struct fib6_config cfg = {
3509
		.fc_metric	= IP6_RT_PRIO_USER,
3510
		.fc_ifindex	= dev->ifindex,
3511 3512 3513
		.fc_dst_len	= prefixlen,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
				  RTF_UP | RTF_PREF(pref),
3514
		.fc_protocol = RTPROT_RA,
3515
		.fc_type = RTN_UNICAST,
3516
		.fc_nlinfo.portid = 0,
3517 3518
		.fc_nlinfo.nlh = NULL,
		.fc_nlinfo.nl_net = net,
3519 3520
	};

3521
	cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
A
Alexey Dobriyan 已提交
3522 3523
	cfg.fc_dst = *prefix;
	cfg.fc_gateway = *gwaddr;
3524

3525 3526
	/* We should treat it as a default route if prefix length is 0. */
	if (!prefixlen)
3527
		cfg.fc_flags |= RTF_DEFAULT;
3528

3529
	ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3530

3531
	return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3532 3533 3534
}
#endif

3535
struct fib6_info *rt6_get_dflt_router(struct net *net,
3536 3537
				     const struct in6_addr *addr,
				     struct net_device *dev)
3538
{
3539
	u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3540
	struct fib6_info *rt;
T
Thomas Graf 已提交
3541
	struct fib6_table *table;
L
Linus Torvalds 已提交
3542

3543
	table = fib6_get_table(net, tb_id);
3544
	if (!table)
T
Thomas Graf 已提交
3545
		return NULL;
L
Linus Torvalds 已提交
3546

3547 3548
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
3549
		if (dev == rt->fib6_nh.nh_dev &&
3550
		    ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3551
		    ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
L
Linus Torvalds 已提交
3552 3553
			break;
	}
3554 3555
	if (rt && !fib6_info_hold_safe(rt))
		rt = NULL;
3556
	rcu_read_unlock();
L
Linus Torvalds 已提交
3557 3558 3559
	return rt;
}

3560
struct fib6_info *rt6_add_dflt_router(struct net *net,
3561
				     const struct in6_addr *gwaddr,
3562 3563
				     struct net_device *dev,
				     unsigned int pref)
L
Linus Torvalds 已提交
3564
{
3565
	struct fib6_config cfg = {
D
David Ahern 已提交
3566
		.fc_table	= l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3567
		.fc_metric	= IP6_RT_PRIO_USER,
3568 3569 3570
		.fc_ifindex	= dev->ifindex,
		.fc_flags	= RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
				  RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3571
		.fc_protocol = RTPROT_RA,
3572
		.fc_type = RTN_UNICAST,
3573
		.fc_nlinfo.portid = 0,
3574
		.fc_nlinfo.nlh = NULL,
3575
		.fc_nlinfo.nl_net = net,
3576
	};
L
Linus Torvalds 已提交
3577

A
Alexey Dobriyan 已提交
3578
	cfg.fc_gateway = *gwaddr;
L
Linus Torvalds 已提交
3579

3580
	if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3581 3582 3583 3584 3585 3586
		struct fib6_table *table;

		table = fib6_get_table(dev_net(dev), cfg.fc_table);
		if (table)
			table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
	}
L
Linus Torvalds 已提交
3587

3588
	return rt6_get_dflt_router(net, gwaddr, dev);
L
Linus Torvalds 已提交
3589 3590
}

3591 3592
static void __rt6_purge_dflt_routers(struct net *net,
				     struct fib6_table *table)
L
Linus Torvalds 已提交
3593
{
3594
	struct fib6_info *rt;
L
Linus Torvalds 已提交
3595 3596

restart:
3597 3598
	rcu_read_lock();
	for_each_fib6_node_rt_rcu(&table->tb6_root) {
D
David Ahern 已提交
3599 3600 3601
		struct net_device *dev = fib6_info_nh_dev(rt);
		struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;

3602
		if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3603 3604
		    (!idev || idev->cnf.accept_ra != 2) &&
		    fib6_info_hold_safe(rt)) {
3605 3606
			rcu_read_unlock();
			ip6_del_rt(net, rt);
L
Linus Torvalds 已提交
3607 3608 3609
			goto restart;
		}
	}
3610
	rcu_read_unlock();
3611 3612 3613 3614 3615 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626

	table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
}

/* Purge RA default routers from every FIB table that has been flagged
 * as containing one.
 */
void rt6_purge_dflt_routers(struct net *net)
{
	struct fib6_table *table;
	struct hlist_head *head;
	unsigned int h;

	rcu_read_lock();

	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
		head = &net->ipv6.fib_table_hash[h];
		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
			if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
				__rt6_purge_dflt_routers(net, table);
		}
	}

	rcu_read_unlock();
}

3634 3635
static void rtmsg_to_fib6_config(struct net *net,
				 struct in6_rtmsg *rtmsg,
3636 3637 3638 3639
				 struct fib6_config *cfg)
{
	memset(cfg, 0, sizeof(*cfg));

D
David Ahern 已提交
3640 3641
	cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
			 : RT6_TABLE_MAIN;
3642 3643 3644 3645 3646 3647
	cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
	cfg->fc_metric = rtmsg->rtmsg_metric;
	cfg->fc_expires = rtmsg->rtmsg_info;
	cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
	cfg->fc_src_len = rtmsg->rtmsg_src_len;
	cfg->fc_flags = rtmsg->rtmsg_flags;
3648
	cfg->fc_type = rtmsg->rtmsg_type;
3649

3650
	cfg->fc_nlinfo.nl_net = net;
3651

A
Alexey Dobriyan 已提交
3652 3653 3654
	cfg->fc_dst = rtmsg->rtmsg_dst;
	cfg->fc_src = rtmsg->rtmsg_src;
	cfg->fc_gateway = rtmsg->rtmsg_gateway;
3655 3656
}

3657
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point.  Requires CAP_NET_ADMIN
 * in the owning user namespace; all other commands get -EINVAL.
 */
int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
{
	struct fib6_config cfg;
	struct in6_rtmsg rtmsg;
	int err;

	switch (cmd) {
	case SIOCADDRT:		/* Add a route */
	case SIOCDELRT:		/* Delete a route */
		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
			return -EPERM;
		err = copy_from_user(&rtmsg, arg,
				     sizeof(struct in6_rtmsg));
		if (err)
			return -EFAULT;

		rtmsg_to_fib6_config(net, &rtmsg, &cfg);

		rtnl_lock();
		switch (cmd) {
		case SIOCADDRT:
			err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
			break;
		case SIOCDELRT:
			err = ip6_route_del(&cfg, NULL);
			break;
		default:
			err = -EINVAL;
		}
		rtnl_unlock();

		return err;
	}

	return -EINVAL;
}

/*
 *	Drop the packet on the floor
 */

3698
/* Common "no route" sink: bump the relevant SNMP counter, send an ICMPv6
 * destination-unreachable with @code, then drop the packet.
 */
static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
{
	int type;
	struct dst_entry *dst = skb_dst(skb);
	switch (ipstats_mib_noroutes) {
	case IPSTATS_MIB_INNOROUTES:
		type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
		if (type == IPV6_ADDR_ANY) {
			/* unspecified destination counts as an address error */
			IP6_INC_STATS(dev_net(dst->dev),
				      __in6_dev_get_safely(skb->dev),
				      IPSTATS_MIB_INADDRERRORS);
			break;
		}
		/* FALLTHROUGH */
	case IPSTATS_MIB_OUTNOROUTES:
		IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
			      ipstats_mib_noroutes);
		break;
	}
	icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
	kfree_skb(skb);
	return 0;
}

3722 3723
static int ip6_pkt_discard(struct sk_buff *skb)
{
3724
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3725 3726
}

E
Eric W. Biederman 已提交
3727
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
L
Linus Torvalds 已提交
3728
{
E
Eric Dumazet 已提交
3729
	skb->dev = skb_dst(skb)->dev;
3730
	return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
L
Linus Torvalds 已提交
3731 3732
}

3733 3734
static int ip6_pkt_prohibit(struct sk_buff *skb)
{
3735
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3736 3737
}

E
Eric W. Biederman 已提交
3738
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3739
{
E
Eric Dumazet 已提交
3740
	skb->dev = skb_dst(skb)->dev;
3741
	return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3742 3743
}

L
Linus Torvalds 已提交
3744 3745 3746 3747
/*
 *	Allocate a dst for local (unicast / anycast) address.
 */

3748 3749 3750 3751
/*
 *	Allocate a fib6_info for a local unicast or anycast address.
 *	The entry is not yet inserted into the FIB; the caller does that.
 *	Returns ERR_PTR(-ENOMEM) on allocation failure.
 */
struct fib6_info *addrconf_f6i_alloc(struct net *net,
				     struct inet6_dev *idev,
				     const struct in6_addr *addr,
				     bool anycast, gfp_t gfp_flags)
{
	u32 tb_id;
	struct net_device *dev = idev->dev;
	struct fib6_info *f6i;

	f6i = fib6_info_alloc(gfp_flags);
	if (!f6i)
		return ERR_PTR(-ENOMEM);

	f6i->dst_nocount = true;
	f6i->dst_host = true;
	f6i->fib6_protocol = RTPROT_KERNEL;
	f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
	if (anycast) {
		f6i->fib6_type = RTN_ANYCAST;
		f6i->fib6_flags |= RTF_ANYCAST;
	} else {
		f6i->fib6_type = RTN_LOCAL;
		f6i->fib6_flags |= RTF_LOCAL;
	}

	f6i->fib6_nh.nh_gw = *addr;
	dev_hold(dev);
	f6i->fib6_nh.nh_dev = dev;
	f6i->fib6_dst.addr = *addr;
	f6i->fib6_dst.plen = 128;
	tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
	f6i->fib6_table = fib6_get_table(net, tb_id);

	return f6i;
}

3784 3785 3786 3787 3788 3789 3790
/* remove deleted ip from prefsrc entries */
struct arg_dev_net_ip {
	struct net_device *dev;	/* device being cleaned; NULL matches any */
	struct net *net;	/* owning network namespace */
	struct in6_addr *addr;	/* preferred source address to remove */
};

3791
static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3792 3793 3794 3795 3796
{
	struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
	struct net *net = ((struct arg_dev_net_ip *)arg)->net;
	struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;

3797
	if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
D
David Ahern 已提交
3798
	    rt != net->ipv6.fib6_null_entry &&
3799
	    ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3800
		spin_lock_bh(&rt6_exception_lock);
3801
		/* remove prefsrc entry */
3802
		rt->fib6_prefsrc.plen = 0;
3803 3804 3805
		/* need to update cache as well */
		rt6_exceptions_remove_prefsrc(rt);
		spin_unlock_bh(&rt6_exception_lock);
3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 3816 3817
	}
	return 0;
}

void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
{
	struct net *net = dev_net(ifp->idev->dev);
	struct arg_dev_net_ip adni = {
		.dev = ifp->idev->dev,
		.net = net,
		.addr = &ifp->addr,
	};
3818
	fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3819 3820
}

3821 3822 3823
#define RTF_RA_ROUTER		(RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)

/* Remove routers and update dst entries when gateway turn into host. */
3824
static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3825 3826 3827
{
	struct in6_addr *gateway = (struct in6_addr *)arg;

3828
	if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3829
	    ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3830 3831
		return -1;
	}
3832 3833 3834 3835 3836 3837 3838

	/* Further clean up cached routes in exception table.
	 * This is needed because cached route may have a different
	 * gateway than its 'parent' in the case of an ip redirect.
	 */
	rt6_exceptions_clean_tohost(rt, gateway);

3839 3840 3841 3842 3843 3844 3845 3846
	return 0;
}

/* Remove routes/exceptions referencing @gateway after it became a host. */
void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
{
	fib6_clean_all(net, fib6_clean_tohost, gateway);
}

3847 3848
/* Argument for the fib6_ifup()/fib6_ifdown() FIB walkers. */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* RTNH_F_* flags to clear (ifup) */
		unsigned long event;	/* NETDEV_* event code (ifdown) */
	};
};

3855
static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3856
{
3857
	struct fib6_info *iter;
3858 3859
	struct fib6_node *fn;

3860 3861
	fn = rcu_dereference_protected(rt->fib6_node,
			lockdep_is_held(&rt->fib6_table->tb6_lock));
3862
	iter = rcu_dereference_protected(fn->leaf,
3863
			lockdep_is_held(&rt->fib6_table->tb6_lock));
3864
	while (iter) {
3865
		if (iter->fib6_metric == rt->fib6_metric &&
3866
		    rt6_qualify_for_ecmp(iter))
3867
			return iter;
3868
		iter = rcu_dereference_protected(iter->fib6_next,
3869
				lockdep_is_held(&rt->fib6_table->tb6_lock));
3870 3871 3872 3873 3874
	}

	return NULL;
}

3875
static bool rt6_is_dead(const struct fib6_info *rt)
3876
{
3877 3878
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
	    (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
D
David Ahern 已提交
3879
	     fib6_ignore_linkdown(rt)))
3880 3881 3882 3883 3884
		return true;

	return false;
}

3885
static int rt6_multipath_total_weight(const struct fib6_info *rt)
3886
{
3887
	struct fib6_info *iter;
3888 3889 3890
	int total = 0;

	if (!rt6_is_dead(rt))
3891
		total += rt->fib6_nh.nh_weight;
3892

3893
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3894
		if (!rt6_is_dead(iter))
3895
			total += iter->fib6_nh.nh_weight;
3896 3897 3898 3899 3900
	}

	return total;
}

3901
static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3902 3903 3904 3905
{
	int upper_bound = -1;

	if (!rt6_is_dead(rt)) {
3906
		*weight += rt->fib6_nh.nh_weight;
3907 3908 3909
		upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
						    total) - 1;
	}
3910
	atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3911 3912
}

3913
static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3914
{
3915
	struct fib6_info *iter;
3916 3917 3918 3919
	int weight = 0;

	rt6_upper_bound_set(rt, &weight, total);

3920
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3921 3922 3923
		rt6_upper_bound_set(iter, &weight, total);
}

3924
void rt6_multipath_rebalance(struct fib6_info *rt)
3925
{
3926
	struct fib6_info *first;
3927 3928 3929 3930 3931 3932
	int total;

	/* In case the entire multipath route was marked for flushing,
	 * then there is no need to rebalance upon the removal of every
	 * sibling route.
	 */
3933
	if (!rt->fib6_nsiblings || rt->should_flush)
3934 3935 3936 3937 3938 3939 3940 3941 3942 3943 3944 3945 3946 3947
		return;

	/* During lookup routes are evaluated in order, so we need to
	 * make sure upper bounds are assigned from the first sibling
	 * onwards.
	 */
	first = rt6_multipath_first_sibling(rt);
	if (WARN_ON_ONCE(!first))
		return;

	total = rt6_multipath_total_weight(first);
	rt6_multipath_upper_bound_set(first, total);
}

3948
static int fib6_ifup(struct fib6_info *rt, void *p_arg)
3949 3950
{
	const struct arg_netdev_event *arg = p_arg;
3951
	struct net *net = dev_net(arg->dev);
3952

D
David Ahern 已提交
3953
	if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
3954
		rt->fib6_nh.nh_flags &= ~arg->nh_flags;
3955
		fib6_update_sernum_upto_root(net, rt);
3956
		rt6_multipath_rebalance(rt);
3957
	}
3958 3959 3960 3961 3962 3963 3964 3965

	return 0;
}

void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct arg_netdev_event arg = {
		.dev = dev,
I
Ido Schimmel 已提交
3966 3967 3968
		{
			.nh_flags = nh_flags,
		},
3969 3970 3971 3972 3973 3974 3975 3976
	};

	if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
		arg.nh_flags |= RTNH_F_LINKDOWN;

	fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
}

3977
static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
3978 3979
				   const struct net_device *dev)
{
3980
	struct fib6_info *iter;
3981

3982
	if (rt->fib6_nh.nh_dev == dev)
3983
		return true;
3984
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3985
		if (iter->fib6_nh.nh_dev == dev)
3986 3987 3988 3989 3990
			return true;

	return false;
}

3991
static void rt6_multipath_flush(struct fib6_info *rt)
3992
{
3993
	struct fib6_info *iter;
3994 3995

	rt->should_flush = 1;
3996
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3997 3998 3999
		iter->should_flush = 1;
}

4000
static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4001 4002
					     const struct net_device *down_dev)
{
4003
	struct fib6_info *iter;
4004 4005
	unsigned int dead = 0;

4006 4007
	if (rt->fib6_nh.nh_dev == down_dev ||
	    rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4008
		dead++;
4009
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4010 4011
		if (iter->fib6_nh.nh_dev == down_dev ||
		    iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4012 4013 4014 4015 4016
			dead++;

	return dead;
}

4017
static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4018 4019 4020
				       const struct net_device *dev,
				       unsigned int nh_flags)
{
4021
	struct fib6_info *iter;
4022

4023 4024
	if (rt->fib6_nh.nh_dev == dev)
		rt->fib6_nh.nh_flags |= nh_flags;
4025
	list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4026 4027
		if (iter->fib6_nh.nh_dev == dev)
			iter->fib6_nh.nh_flags |= nh_flags;
4028 4029
}

4030
/* called with write lock held for table with rt */
4031
static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
L
Linus Torvalds 已提交
4032
{
4033 4034
	const struct arg_netdev_event *arg = p_arg;
	const struct net_device *dev = arg->dev;
4035
	struct net *net = dev_net(dev);
4036

D
David Ahern 已提交
4037
	if (rt == net->ipv6.fib6_null_entry)
4038 4039 4040 4041
		return 0;

	switch (arg->event) {
	case NETDEV_UNREGISTER:
4042
		return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4043
	case NETDEV_DOWN:
4044
		if (rt->should_flush)
4045
			return -1;
4046
		if (!rt->fib6_nsiblings)
4047
			return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4048 4049 4050 4051
		if (rt6_multipath_uses_dev(rt, dev)) {
			unsigned int count;

			count = rt6_multipath_dead_count(rt, dev);
4052
			if (rt->fib6_nsiblings + 1 == count) {
4053 4054 4055 4056 4057
				rt6_multipath_flush(rt);
				return -1;
			}
			rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
						   RTNH_F_LINKDOWN);
4058
			fib6_update_sernum(net, rt);
4059
			rt6_multipath_rebalance(rt);
4060 4061
		}
		return -2;
4062
	case NETDEV_CHANGE:
4063
		if (rt->fib6_nh.nh_dev != dev ||
4064
		    rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4065
			break;
4066
		rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4067
		rt6_multipath_rebalance(rt);
4068
		break;
4069
	}
4070

L
Linus Torvalds 已提交
4071 4072 4073
	return 0;
}

4074
void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
L
Linus Torvalds 已提交
4075
{
4076
	struct arg_netdev_event arg = {
4077
		.dev = dev,
I
Ido Schimmel 已提交
4078 4079 4080
		{
			.event = event,
		},
4081 4082
	};

4083 4084 4085 4086 4087 4088 4089 4090
	fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
}

/* Full IPv6 teardown for @dev: sync the FIB, flush uncached routes
 * referencing the device, and drop its neighbour entries.
 */
void rt6_disable_ip(struct net_device *dev, unsigned long event)
{
	rt6_sync_down_dev(dev, event);
	rt6_uncached_list_flush_dev(dev_net(dev), dev);
	neigh_ifdown(&nd_tbl, dev);
}

4093
/* Argument for the rt6_mtu_change_route() FIB walker. */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* new MTU value */
};

4098
/* Per-route callback for rt6_mtu_change(): propagate a device MTU change
 * into the route metric and its cached PMTU exceptions.  Always returns 0
 * (routes are never deleted here).
 */
static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
	struct inet6_dev *idev;

	/* In IPv6 pmtu discovery is not optional,
	   so that RTAX_MTU lock cannot disable it.
	   We still use this lock to block changes
	   caused by addrconf/ndisc.
	*/

	idev = __in6_dev_get(arg->dev);
	if (!idev)
		return 0;

	/* For administrative MTU increase, there is no way to discover
	   IPv6 PMTU increase, so PMTU increase should be updated here.
	   Since RFC 1981 doesn't include administrative MTU increase
	   update PMTU increase is a MUST. (i.e. jumbo frame)
	 */
	if (rt->fib6_nh.nh_dev == arg->dev &&
	    !fib6_metric_locked(rt, RTAX_MTU)) {
		u32 mtu = rt->fib6_pmtu;

		/* shrink always; grow only if the route tracked the old
		 * device MTU (i.e. was not a discovered, smaller PMTU)
		 */
		if (mtu >= arg->mtu ||
		    (mtu < arg->mtu && mtu == idev->cnf.mtu6))
			fib6_metric_set(rt, RTAX_MTU, arg->mtu);

		/* exception cache is protected by rt6_exception_lock */
		spin_lock_bh(&rt6_exception_lock);
		rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
		spin_unlock_bh(&rt6_exception_lock);
	}
	return 0;
}

4133
/* Called when @dev's MTU changes to @mtu: update every route in the
 * device's netns via rt6_mtu_change_route().
 */
void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
{
	struct rt6_mtu_change_arg arg = {
		.dev = dev,
		.mtu = mtu,
	};

	fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
}

4143
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE requests;
 * enforced by nlmsg_parse() before any attribute is consumed.
 */
static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
	[RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
	[RTA_PREFSRC]		= { .len = sizeof(struct in6_addr) },
	[RTA_OIF]               = { .type = NLA_U32 },
	[RTA_IIF]		= { .type = NLA_U32 },
	[RTA_PRIORITY]          = { .type = NLA_U32 },
	[RTA_METRICS]           = { .type = NLA_NESTED },
	[RTA_MULTIPATH]		= { .len = sizeof(struct rtnexthop) },
	[RTA_PREF]              = { .type = NLA_U8 },
	[RTA_ENCAP_TYPE]	= { .type = NLA_U16 },
	[RTA_ENCAP]		= { .type = NLA_NESTED },
	[RTA_EXPIRES]		= { .type = NLA_U32 },
	[RTA_UID]		= { .type = NLA_U32 },
	[RTA_MARK]		= { .type = NLA_U32 },
	[RTA_TABLE]		= { .type = NLA_U32 },
	[RTA_IP_PROTO]		= { .type = NLA_U8 },
	[RTA_SPORT]		= { .type = NLA_U16 },
	[RTA_DPORT]		= { .type = NLA_U16 },
};

/* Translate an RTM_NEWROUTE/RTM_DELROUTE netlink message into a
 * struct fib6_config.  Returns 0 on success or a negative errno.
 * Note: cfg->fc_mx / fc_mp / fc_encap point INTO the nlmsg payload;
 * the skb must outlive the config.
 */
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct fib6_config *cfg,
			      struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	unsigned int pref;
	int err;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  NULL);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	rtm = nlmsg_data(nlh);
	memset(cfg, 0, sizeof(*cfg));

	/* fixed rtmsg header fields first */
	cfg->fc_table = rtm->rtm_table;
	cfg->fc_dst_len = rtm->rtm_dst_len;
	cfg->fc_src_len = rtm->rtm_src_len;
	cfg->fc_flags = RTF_UP;
	cfg->fc_protocol = rtm->rtm_protocol;
	cfg->fc_type = rtm->rtm_type;

	if (rtm->rtm_type == RTN_UNREACHABLE ||
	    rtm->rtm_type == RTN_BLACKHOLE ||
	    rtm->rtm_type == RTN_PROHIBIT ||
	    rtm->rtm_type == RTN_THROW)
		cfg->fc_flags |= RTF_REJECT;

	if (rtm->rtm_type == RTN_LOCAL)
		cfg->fc_flags |= RTF_LOCAL;

	if (rtm->rtm_flags & RTM_F_CLONED)
		cfg->fc_flags |= RTF_CACHE;

	cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);

	/* who to notify when the route is installed/removed */
	cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
	cfg->fc_nlinfo.nlh = nlh;
	cfg->fc_nlinfo.nl_net = sock_net(skb->sk);

	if (tb[RTA_GATEWAY]) {
		cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
		cfg->fc_flags |= RTF_GATEWAY;
	}

	if (tb[RTA_DST]) {
		/* prefix may be shorter than 16 bytes on the wire */
		int plen = (rtm->rtm_dst_len + 7) >> 3;

		if (nla_len(tb[RTA_DST]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
	}

	if (tb[RTA_SRC]) {
		int plen = (rtm->rtm_src_len + 7) >> 3;

		if (nla_len(tb[RTA_SRC]) < plen)
			goto errout;

		nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
	}

	if (tb[RTA_PREFSRC])
		cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);

	if (tb[RTA_OIF])
		cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_PRIORITY])
		cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);

	if (tb[RTA_METRICS]) {
		cfg->fc_mx = nla_data(tb[RTA_METRICS]);
		cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
	}

	/* RTA_TABLE overrides the 8-bit rtm_table header field */
	if (tb[RTA_TABLE])
		cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);

	if (tb[RTA_MULTIPATH]) {
		cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
		cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);

		err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
						     cfg->fc_mp_len, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_PREF]) {
		/* coerce unknown preference values to MEDIUM (RFC 4191) */
		pref = nla_get_u8(tb[RTA_PREF]);
		if (pref != ICMPV6_ROUTER_PREF_LOW &&
		    pref != ICMPV6_ROUTER_PREF_HIGH)
			pref = ICMPV6_ROUTER_PREF_MEDIUM;
		cfg->fc_flags |= RTF_PREF(pref);
	}

	if (tb[RTA_ENCAP])
		cfg->fc_encap = tb[RTA_ENCAP];

	if (tb[RTA_ENCAP_TYPE]) {
		cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);

		err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
		if (err < 0)
			goto errout;
	}

	if (tb[RTA_EXPIRES]) {
		unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);

		if (addrconf_finite_timeout(timeout)) {
			cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
			cfg->fc_flags |= RTF_EXPIRES;
		}
	}

	err = 0;
errout:
	return err;
}

4289
/* One pending nexthop while building/replacing a multipath route */
struct rt6_nh {
	struct fib6_info *fib6_info;	/* route created for this nexthop */
	struct fib6_config r_cfg;	/* per-nexthop config (for rollback) */
	struct list_head next;		/* link in rt6_nh_list */
};

/* Log every nexthop of a multipath route whose replace operation failed,
 * so the admin can audit what is actually installed.
 *
 * Fix: the message previously hard-coded an "IPV6: " prefix on top of this
 * file's pr_fmt() ("IPv6: ", see top of file), yielding a doubled
 * "IPv6: IPV6: ..." prefix in the kernel log.  Drop the redundant one.
 */
static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
{
	struct rt6_nh *nh;

	list_for_each_entry(nh, rt6_nh_list, next) {
		pr_warn("multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
			&nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
			nh->r_cfg.fc_ifindex);
	}
}

4306 4307
/* Append @rt to @rt6_nh_list unless an equivalent nexthop is already
 * queued.  Returns 0 on success, -EEXIST on duplicate, -ENOMEM on
 * allocation failure.  On success the list entry takes over the caller's
 * reference bookkeeping for @rt (released later in multipath_add cleanup).
 */
static int ip6_route_info_append(struct net *net,
				 struct list_head *rt6_nh_list,
				 struct fib6_info *rt,
				 struct fib6_config *r_cfg)
{
	struct rt6_nh *nh;
	int err = -EEXIST;

	list_for_each_entry(nh, rt6_nh_list, next) {
		/* check if fib6_info already exists */
		if (rt6_duplicate_nexthop(nh->fib6_info, rt))
			return err;
	}

	nh = kzalloc(sizeof(*nh), GFP_KERNEL);
	if (!nh)
		return -ENOMEM;
	nh->fib6_info = rt;
	memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
	list_add_tail(&nh->next, rt6_nh_list);

	return 0;
}

4330 4331
/* Send the single RTM_NEWROUTE notification for a multipath add/replace.
 * For NLM_F_APPEND, rewind from the last-inserted route to the head of
 * its sibling group so userspace sees the route from its first nexthop.
 */
static void ip6_route_mpath_notify(struct fib6_info *rt,
				   struct fib6_info *rt_last,
				   struct nl_info *info,
				   __u16 nlflags)
{
	/* if this is an APPEND route, then rt points to the first route
	 * inserted and rt_last points to last route inserted. Userspace
	 * wants a consistent dump of the route which starts at the first
	 * nexthop. Since sibling routes are always added at the end of
	 * the list, find the first sibling of the last route appended
	 */
	if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
		rt = list_first_entry(&rt_last->fib6_siblings,
				      struct fib6_info,
				      fib6_siblings);
	}

	if (rt)
		inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
}

4351 4352
/* Install a multipath (ECMP) route described by cfg->fc_mp.
 *
 * Phase 1: parse each rtnexthop entry into its own fib6_info and queue
 *          it on rt6_nh_list.
 * Phase 2: insert the queued routes one by one; on any failure, notify
 *          what was added, then delete it again so the operation is
 *          all-or-nothing.
 * A single netlink notification covering all nexthops is sent at the end
 * (per-route notifications are suppressed via info->skip_notify).
 */
static int ip6_route_multipath_add(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
{
	struct fib6_info *rt_notif = NULL, *rt_last = NULL;
	struct nl_info *info = &cfg->fc_nlinfo;
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	struct fib6_info *rt;
	struct rt6_nh *err_nh;
	struct rt6_nh *nh, *nh_safe;
	__u16 nlflags;
	int remaining;
	int attrlen;
	int err = 1;
	int nhn = 0;
	int replace = (cfg->fc_nlinfo.nlh &&
		       (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
	LIST_HEAD(rt6_nh_list);

	nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
	if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
		nlflags |= NLM_F_APPEND;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry and build a list (rt6_nh_list) of
	 * fib6_info structs per nexthop
	 */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				r_cfg.fc_gateway = nla_get_in6_addr(nla);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
			r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
			nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
			if (nla)
				r_cfg.fc_encap_type = nla_get_u16(nla);
		}

		r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
		rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
		if (IS_ERR(rt)) {
			err = PTR_ERR(rt);
			rt = NULL;
			goto cleanup;
		}
		if (!rt6_qualify_for_ecmp(rt)) {
			err = -EINVAL;
			NL_SET_ERR_MSG(extack,
				       "Device only routes can not be added for IPv6 using the multipath API.");
			fib6_info_release(rt);
			goto cleanup;
		}

		/* rtnh_hops is weight-1 on the wire */
		rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;

		err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
					    rt, &r_cfg);
		if (err) {
			fib6_info_release(rt);
			goto cleanup;
		}

		rtnh = rtnh_next(rtnh, &remaining);
	}

	/* for add and replace send one notification with all nexthops.
	 * Skip the notification in fib6_add_rt2node and send one with
	 * the full route when done
	 */
	info->skip_notify = 1;

	err_nh = NULL;
	list_for_each_entry(nh, &rt6_nh_list, next) {
		err = __ip6_ins_rt(nh->fib6_info, info, extack);
		fib6_info_release(nh->fib6_info);

		if (!err) {
			/* save reference to last route successfully inserted */
			rt_last = nh->fib6_info;

			/* save reference to first route for notification */
			if (!rt_notif)
				rt_notif = nh->fib6_info;
		}

		/* nh->fib6_info is used or freed at this point, reset to NULL*/
		nh->fib6_info = NULL;
		if (err) {
			if (replace && nhn)
				ip6_print_replace_route_err(&rt6_nh_list);
			err_nh = nh;
			goto add_errout;
		}

		/* Because each route is added like a single route we remove
		 * these flags after the first nexthop: if there is a collision,
		 * we have already failed to add the first nexthop:
		 * fib6_add_rt2node() has rejected it; when replacing, old
		 * nexthops have been replaced by first new, the rest should
		 * be added to it.
		 */
		cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
						     NLM_F_REPLACE);
		nhn++;
	}

	/* success ... tell user about new route */
	ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
	goto cleanup;

add_errout:
	/* send notification for routes that were added so that
	 * the delete notifications sent by ip6_route_del are
	 * coherent
	 */
	if (rt_notif)
		ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);

	/* Delete routes that were already added */
	list_for_each_entry(nh, &rt6_nh_list, next) {
		if (err_nh == nh)
			break;
		ip6_route_del(&nh->r_cfg, extack);
	}

cleanup:
	list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
		if (nh->fib6_info)
			fib6_info_release(nh->fib6_info);
		list_del(&nh->next);
		kfree(nh);
	}

	return err;
}

4498 4499
static int ip6_route_multipath_del(struct fib6_config *cfg,
				   struct netlink_ext_ack *extack)
4500 4501 4502 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525
{
	struct fib6_config r_cfg;
	struct rtnexthop *rtnh;
	int remaining;
	int attrlen;
	int err = 1, last_err = 0;

	remaining = cfg->fc_mp_len;
	rtnh = (struct rtnexthop *)cfg->fc_mp;

	/* Parse a Multipath Entry */
	while (rtnh_ok(rtnh, remaining)) {
		memcpy(&r_cfg, cfg, sizeof(*cfg));
		if (rtnh->rtnh_ifindex)
			r_cfg.fc_ifindex = rtnh->rtnh_ifindex;

		attrlen = rtnh_attrlen(rtnh);
		if (attrlen > 0) {
			struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

			nla = nla_find(attrs, attrlen, RTA_GATEWAY);
			if (nla) {
				nla_memcpy(&r_cfg.fc_gateway, nla, 16);
				r_cfg.fc_flags |= RTF_GATEWAY;
			}
		}
4526
		err = ip6_route_del(&r_cfg, extack);
4527 4528 4529
		if (err)
			last_err = err;

4530 4531 4532 4533 4534 4535
		rtnh = rtnh_next(rtnh, &remaining);
	}

	return last_err;
}

4536 4537
/* RTM_DELROUTE handler: parse the request and dispatch to the multipath
 * or single-route delete path.
 */
static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_del(&cfg, extack);
	else {
		/* no nexthop list given: delete the whole sibling group */
		cfg.fc_delete_all_nh = 1;
		return ip6_route_del(&cfg, extack);
	}
}

4554 4555
/* RTM_NEWROUTE handler: parse the request and dispatch to the multipath
 * or single-route add path.
 */
static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct fib6_config cfg;
	int err;

	err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
	if (err < 0)
		return err;

	if (cfg.fc_mp)
		return ip6_route_multipath_add(&cfg, extack);
	else
		return ip6_route_add(&cfg, GFP_KERNEL, extack);
}

4570
/* Worst-case netlink message size for a route notification; must cover
 * everything rt6_fill_node() can emit or inet6_rt_notify() WARNs on
 * -EMSGSIZE.
 */
static size_t rt6_nlmsg_size(struct fib6_info *rt)
{
	int nexthop_len = 0;

	if (rt->fib6_nsiblings) {
		/* per-nexthop cost inside RTA_MULTIPATH */
		nexthop_len = nla_total_size(0)	 /* RTA_MULTIPATH */
			    + NLA_ALIGN(sizeof(struct rtnexthop))
			    + nla_total_size(16) /* RTA_GATEWAY */
			    + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);

		nexthop_len *= rt->fib6_nsiblings;
	}

	return NLMSG_ALIGN(sizeof(struct rtmsg))
	       + nla_total_size(16) /* RTA_SRC */
	       + nla_total_size(16) /* RTA_DST */
	       + nla_total_size(16) /* RTA_GATEWAY */
	       + nla_total_size(16) /* RTA_PREFSRC */
	       + nla_total_size(4) /* RTA_TABLE */
	       + nla_total_size(4) /* RTA_IIF */
	       + nla_total_size(4) /* RTA_OIF */
	       + nla_total_size(4) /* RTA_PRIORITY */
	       + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
	       + nla_total_size(sizeof(struct rta_cacheinfo))
	       + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
	       + nla_total_size(1) /* RTA_PREF */
	       + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
	       + nexthop_len;
}

4600
/* Emit the nexthop attributes (gateway, oif, encap) and accumulate
 * RTNH_F_* status bits into *flags.  @skip_oif is set for multipath
 * encoding where the ifindex lives in struct rtnexthop instead.
 * Returns 0 or -EMSGSIZE if the skb ran out of room.
 */
static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
			    unsigned int *flags, bool skip_oif)
{
	if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
		*flags |= RTNH_F_DEAD;

	if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
		*flags |= RTNH_F_LINKDOWN;

		/* fib6_ignore_linkdown() dereferences RCU-protected state */
		rcu_read_lock();
		if (fib6_ignore_linkdown(rt))
			*flags |= RTNH_F_DEAD;
		rcu_read_unlock();
	}

	if (rt->fib6_flags & RTF_GATEWAY) {
		if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
			goto nla_put_failure;
	}

	*flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
	if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
		*flags |= RTNH_F_OFFLOAD;

	/* not needed for multipath encoding b/c it has a rtnexthop struct */
	if (!skip_oif && rt->fib6_nh.nh_dev &&
	    nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
		goto nla_put_failure;

	if (rt->fib6_nh.nh_lwtstate &&
	    lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4639
/* add multipath next hop */
/* Append one struct rtnexthop (plus nested attributes) for @rt inside an
 * open RTA_MULTIPATH nest.  Returns 0 or -EMSGSIZE.
 */
static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
{
	const struct net_device *dev = rt->fib6_nh.nh_dev;
	struct rtnexthop *rtnh;
	unsigned int flags = 0;

	rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
	if (!rtnh)
		goto nla_put_failure;

	/* on-wire hops is weight-1 */
	rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
	rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;

	/* skip_oif=true: the ifindex already lives in rtnh above */
	if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
		goto nla_put_failure;

	rtnh->rtnh_flags = flags;

	/* length of rtnetlink header + attributes */
	rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}

4667
/* Build one RTM_* netlink message describing a route.
 *
 * Two modes: @dst != NULL means a resolved rt6_info (cached/clone) is
 * being dumped alongside its originating fib6_info @rt; @dst == NULL
 * means a pure FIB entry dump.  @dest/@src, when set, report the exact
 * looked-up addresses (forcing /128 prefix lengths in the header).
 * Returns 0, or -EMSGSIZE after cancelling the partial message.
 */
static int rt6_fill_node(struct net *net, struct sk_buff *skb,
			 struct fib6_info *rt, struct dst_entry *dst,
			 struct in6_addr *dest, struct in6_addr *src,
			 int iif, int type, u32 portid, u32 seq,
			 unsigned int flags)
{
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6key *rt6_dst, *rt6_src;
	u32 *pmetrics, table, rt6_flags;
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;
	long expires = 0;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	/* pick key/flag sources from the dst clone when present */
	if (rt6) {
		rt6_dst = &rt6->rt6i_dst;
		rt6_src = &rt6->rt6i_src;
		rt6_flags = rt6->rt6i_flags;
	} else {
		rt6_dst = &rt->fib6_dst;
		rt6_src = &rt->fib6_src;
		rt6_flags = rt->fib6_flags;
	}

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET6;
	rtm->rtm_dst_len = rt6_dst->plen;
	rtm->rtm_src_len = rt6_src->plen;
	rtm->rtm_tos = 0;
	if (rt->fib6_table)
		table = rt->fib6_table->tb6_id;
	else
		table = RT6_TABLE_UNSPEC;
	rtm->rtm_table = table;
	if (nla_put_u32(skb, RTA_TABLE, table))
		goto nla_put_failure;

	rtm->rtm_type = rt->fib6_type;
	rtm->rtm_flags = 0;
	rtm->rtm_scope = RT_SCOPE_UNIVERSE;
	rtm->rtm_protocol = rt->fib6_protocol;

	if (rt6_flags & RTF_CACHE)
		rtm->rtm_flags |= RTM_F_CLONED;

	if (dest) {
		if (nla_put_in6_addr(skb, RTA_DST, dest))
			goto nla_put_failure;
		rtm->rtm_dst_len = 128;
	} else if (rtm->rtm_dst_len)
		if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
			goto nla_put_failure;
#ifdef CONFIG_IPV6_SUBTREES
	if (src) {
		if (nla_put_in6_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
		rtm->rtm_src_len = 128;
	} else if (rtm->rtm_src_len &&
		   nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
		goto nla_put_failure;
#endif
	if (iif) {
#ifdef CONFIG_IPV6_MROUTE
		/* multicast destinations are resolved by the mroute code;
		 * ip6mr_get_route() fills the message itself
		 */
		if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
			int err = ip6mr_get_route(net, skb, rtm, portid);

			if (err == 0)
				return 0;
			if (err < 0)
				goto nla_put_failure;
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, iif))
				goto nla_put_failure;
	} else if (dest) {
		struct in6_addr saddr_buf;
		if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
		    nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	if (rt->fib6_prefsrc.plen) {
		struct in6_addr saddr_buf;
		saddr_buf = rt->fib6_prefsrc.addr;
		if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
			goto nla_put_failure;
	}

	pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
	if (rtnetlink_put_metrics(skb, pmetrics) < 0)
		goto nla_put_failure;

	if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
		goto nla_put_failure;

	/* For multipath routes, walk the siblings list and add
	 * each as a nexthop within RTA_MULTIPATH.
	 */
	if (rt6) {
		if (rt6_flags & RTF_GATEWAY &&
		    nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
			goto nla_put_failure;

		if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
			goto nla_put_failure;
	} else if (rt->fib6_nsiblings) {
		struct fib6_info *sibling, *next_sibling;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		if (rt6_add_nexthop(skb, rt) < 0)
			goto nla_put_failure;

		list_for_each_entry_safe(sibling, next_sibling,
					 &rt->fib6_siblings, fib6_siblings) {
			if (rt6_add_nexthop(skb, sibling) < 0)
				goto nla_put_failure;
		}

		nla_nest_end(skb, mp);
	} else {
		if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
			goto nla_put_failure;
	}

	if (rt6_flags & RTF_EXPIRES) {
		expires = dst ? dst->expires : rt->expires;
		expires -= jiffies;
	}

	if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
		goto nla_put_failure;

	if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
		goto nla_put_failure;


	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

4818
/* fib6 walker callback for RTM_GETROUTE dumps.  Returns the result of
 * rt6_fill_node(), 1 to skip a route that doesn't match the filter, or
 * 0 for the null entry.
 */
int rt6_dump_route(struct fib6_info *rt, void *p_arg)
{
	struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
	struct net *net = arg->net;

	if (rt == net->ipv6.fib6_null_entry)
		return 0;

	if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
		struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);

		/* user wants prefix routes only */
		if (rtm->rtm_flags & RTM_F_PREFIX &&
		    !(rt->fib6_flags & RTF_PREFIX_RT)) {
			/* success since this is not a prefix route */
			return 1;
		}
	}

	return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
			     RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
			     arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
}

4842 4843
/* RTM_GETROUTE handler: perform a route lookup described by the request
 * attributes and unicast the matching route back to the requester.
 * RTM_F_FIB_MATCH reports the FIB entry instead of the resolved dst.
 */
static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			      struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	int err, iif = 0, oif = 0;
	struct fib6_info *from;
	struct dst_entry *dst;
	struct rt6_info *rt;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi6 fl6;
	bool fibmatch;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
			  extack);
	if (err < 0)
		goto errout;

	err = -EINVAL;
	memset(&fl6, 0, sizeof(fl6));
	rtm = nlmsg_data(nlh);
	fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
	fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);

	/* build the flow from request attributes */
	if (tb[RTA_SRC]) {
		if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
			goto errout;

		fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
	}

	if (tb[RTA_DST]) {
		if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
			goto errout;

		fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
	}

	if (tb[RTA_IIF])
		iif = nla_get_u32(tb[RTA_IIF]);

	if (tb[RTA_OIF])
		oif = nla_get_u32(tb[RTA_OIF]);

	if (tb[RTA_MARK])
		fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);

	if (tb[RTA_UID])
		fl6.flowi6_uid = make_kuid(current_user_ns(),
					   nla_get_u32(tb[RTA_UID]));
	else
		fl6.flowi6_uid = iif ? INVALID_UID : current_uid();

	if (tb[RTA_SPORT])
		fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &fl6.flowi6_proto, extack);
		if (err)
			goto errout;
	}

	if (iif) {
		/* input-path lookup: resolve as if the packet arrived on iif */
		struct net_device *dev;
		int flags = 0;

		rcu_read_lock();

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			rcu_read_unlock();
			err = -ENODEV;
			goto errout;
		}

		fl6.flowi6_iif = iif;

		if (!ipv6_addr_any(&fl6.saddr))
			flags |= RT6_LOOKUP_F_HAS_SADDR;

		dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);

		rcu_read_unlock();
	} else {
		/* output-path lookup */
		fl6.flowi6_oif = oif;

		dst = ip6_route_output(net, NULL, &fl6);
	}


	rt = container_of(dst, struct rt6_info, dst);
	if (rt->dst.error) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	if (rt == net->ipv6.ip6_null_entry) {
		err = rt->dst.error;
		ip6_rt_put(rt);
		goto errout;
	}

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		ip6_rt_put(rt);
		err = -ENOBUFS;
		goto errout;
	}

	/* skb now owns the dst reference */
	skb_dst_set(skb, &rt->dst);

	/* rt->from is RCU-protected */
	rcu_read_lock();
	from = rcu_dereference(rt->from);

	if (fibmatch)
		err = rt6_fill_node(net, skb, from, NULL, NULL, NULL, iif,
				    RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, 0);
	else
		err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
				    &fl6.saddr, iif, RTM_NEWROUTE,
				    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
				    0);
	rcu_read_unlock();

	if (err < 0) {
		kfree_skb(skb);
		goto errout;
	}

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;
}

4983
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure the error is recorded
 * on the group so listeners see ENOBUFS.
 */
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}

5014
/* Netdevice notifier: bind the per-netns special routes (null, and with
 * multiple-tables also prohibit/blackhole) to the loopback device when it
 * registers, and release their idev references on unregister.
 */
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	/* only the loopback device anchors the special routes */
	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	 } else if (event == NETDEV_UNREGISTER &&
		    dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}

L
Linus Torvalds 已提交
5048 5049 5050 5051 5052 5053 5054
/*
 *	/proc
 */

#ifdef CONFIG_PROC_FS
/* seq_file show handler for /proc/net/rt6_stats: seven hex counters
 * describing the per-netns FIB state.
 */
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;
	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */

#ifdef CONFIG_SYSCTL

/* sysctl handler for net.ipv6.route.flush (write-only): writing a value
 * triggers garbage collection of the routing cache.  The previously
 * stored delay (ctl->extra1 is the owning netns) selects immediate vs
 * delayed GC.
 */
static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

5087
/* Template for the per-namespace net.ipv6.route.* sysctl table.
 * ipv6_route_sysctl_init() kmemdup()s it and repoints each .data at the
 * matching field of the new namespace, so the &init_net addresses here
 * are placeholders (except gc_thresh, which is global to the template
 * dst_ops). Entry order must stay in sync with the table[N] indices in
 * ipv6_route_sysctl_init().
 */
struct ctl_table ipv6_route_table_template[] = {
	{
		/* write-only trigger, see ipv6_sysctl_rtcache_flush() */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{
		/* same variable as gc_min_interval, exposed in milliseconds */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }
};

5161
/* Duplicate ipv6_route_table_template for a new namespace and rebind
 * every .data pointer to that namespace's own fields. Returns the new
 * table, or NULL on allocation failure (caller treats NULL as -ENOMEM).
 * The table[N] indices must match the template's entry order.
 */
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template), GFP_KERNEL);
	if (!table)
		return NULL;

	table[0].data = &net->ipv6.sysctl.flush_delay;
	table[0].extra1 = net;	/* flush handler needs the netns back */
	table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
	table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
	table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
	table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
	table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
	table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
	table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
	table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
	table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

	/* Don't export sysctls to unprivileged users */
	if (net->user_ns != &init_user_ns)
		table[0].procname = NULL;

	return table;
}
L
Linus Torvalds 已提交
5189 5190
#endif

5191
/* Per-namespace routing state constructor (pernet_operations .init).
 * Sets up the dst_ops, the null/prohibit/blackhole template routes and
 * the GC tunables. On failure, everything allocated so far is unwound
 * via the goto chain at the bottom (kernel-idiomatic cleanup order:
 * strictly the reverse of allocation order). Returns 0 or -ENOMEM.
 */
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	/* fib6_null_entry: FIB-level "no route" placeholder */
	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	/* ip6_null_entry: dst-level "no route" (-ENETUNREACH) entry */
	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	/* prohibit (-EACCES) and blackhole (silent drop) policy entries
	 * only exist when policy routing is configured in.
	 */
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	/* Default GC tunables; user-tunable later via net.ipv6.route.* */
	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}

5266
/* Per-namespace routing state destructor (pernet_operations .exit):
 * release the template route entries created by ip6_route_net_init()
 * and tear down the dst entry accounting.
 */
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	/* policy-routing-only entries, see ip6_route_net_init() */
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}

5277 5278 5279
/* Late per-namespace init: create the /proc/net/ipv6_route and
 * /proc/net/rt6_stats entries. Failures must not be ignored: a missing
 * entry would make remove_proc_entry() in ip6_route_net_exit_late()
 * warn, and the namespace would silently lose its proc interface.
 * Returns 0 on success, -ENOMEM if a proc entry cannot be created.
 */
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	if (!proc_create_net("ipv6_route", 0, net->proc_net,
			     &ipv6_route_seq_ops,
			     sizeof(struct ipv6_route_iter)))
		return -ENOMEM;

	if (!proc_create_net_single("rt6_stats", 0444, net->proc_net,
				    rt6_stats_seq_show, NULL)) {
		/* unwind the first entry so exit_late is never run on a
		 * half-initialized namespace
		 */
		remove_proc_entry("ipv6_route", net->proc_net);
		return -ENOMEM;
	}
#endif
	return 0;
}

/* Late per-namespace teardown: drop the proc entries created by
 * ip6_route_net_init_late().
 */
static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}

5296 5297 5298 5299 5300
/* Core per-netns IPv6 routing state setup/teardown. */
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};

5301 5302 5303 5304 5305 5306 5307 5308 5309 5310 5311 5312 5313 5314 5315 5316
/* Allocate and initialize the per-namespace inet_peer base used for
 * IPv6 peer tracking. Returns 0 on success, -ENOMEM on failure.
 */
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp;

	bp = kmalloc(sizeof(*bp), GFP_KERNEL);
	if (!bp)
		return -ENOMEM;

	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

/* Tear down the per-namespace inet_peer base: detach it from the netns
 * first, then invalidate the peer tree and free the base itself.
 */
static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

5321
/* Per-netns inet_peer base lifecycle. */
static struct pernet_operations ipv6_inetpeer_ops = {
	.init	=	ipv6_inetpeer_init,
	.exit	=	ipv6_inetpeer_exit,
};

5326 5327 5328 5329 5330
/* Proc interface setup/teardown, registered after the core routing ops. */
static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

5331 5332
static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	/* run after the addrconf notifier (lower priority = later) */
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};

5336 5337 5338 5339 5340
/* Point init_net's special route entries at the loopback device.
 * Called once at boot, after loopback registration but before the
 * routing code can take the reference itself.
 */
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
  #endif
}

5352
/* Module/boot-time initialization of the IPv6 routing subsystem.
 * Each step registers one facility; on any failure the goto ladder at
 * the bottom unwinds everything already registered, in reverse order.
 * Returns 0 on success or a negative errno.
 */
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	/* blackhole dsts share the slab cache created above */
	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	/* rtnetlink handlers for route add/del/get; any partial
	 * registration is undone wholesale via rtnl_unregister_all().
	 */
	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	/* per-CPU lists of uncached routes; no failure possible here */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

	/* error unwind: reverse order of the registrations above */
out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}

/* Full teardown of the IPv6 routing subsystem, mirroring
 * ip6_route_init() in reverse registration order.
 */
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}